• Skip to content
  • Skip to link menu
KDE 4.2 API Reference
  • KDE API Reference
  • kdelibs
  • Sitemap
  • Contact Us
 

KDECore

nsCharSetProber.cpp

Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /*  -*- C++ -*-
00003 *  Copyright (C) 1998 <developer@mozilla.org>
00004 *
00005 *
00006 *  Permission is hereby granted, free of charge, to any person obtaining
00007 *  a copy of this software and associated documentation files (the
00008 *  "Software"), to deal in the Software without restriction, including
00009 *  without limitation the rights to use, copy, modify, merge, publish,
00010 *  distribute, sublicense, and/or sell copies of the Software, and to
00011 *  permit persons to whom the Software is furnished to do so, subject to
00012 *  the following conditions:
00013 *
00014 *  The above copyright notice and this permission notice shall be included 
00015 *  in all copies or substantial portions of the Software.
00016 *
00017 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00018 *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00019 *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00020 *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
00021 *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
00022 *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
00023 *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00024 */
00025 
00026 #include "nsCharSetProber.h"
00027 
00028 #include <stdlib.h>
00029 
00030 namespace kencodingprober {
00031 //This filter applies to all scripts which do not use English characters
00032 bool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, unsigned int aLen, char** newBuf, unsigned int& newLen)
00033 {
00034   char *newptr;
00035   char *prevPtr, *curPtr;
00036   
00037   bool meetMSB = false;   
00038   newptr = *newBuf = (char*)malloc(aLen);
00039   if (!newptr)
00040     return false;
00041 
00042   for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; ++curPtr)
00043   {
00044     if (*curPtr & 0x80)
00045     {
00046       meetMSB = true;
00047     }
00048     else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') 
00049     {
00050       //current char is a symbol, most likely a punctuation. we treat it as segment delimiter
00051       if (meetMSB && curPtr > prevPtr) 
00052       //this segment contains more than single symbol, and it has upper ASCII, we need to keep it
00053       {
00054         while (prevPtr < curPtr) *newptr++ = *prevPtr++;  
00055         prevPtr++;
00056         *newptr++ = ' ';
00057         meetMSB = false;
00058       }
00059       else //ignore current segment. (either because it is just a symbol or just an English word)
00060         prevPtr = curPtr+1;
00061     }
00062   }
00063   if (meetMSB && curPtr > prevPtr) 
00064     while (prevPtr < curPtr) *newptr++ = *prevPtr++;  
00065 
00066   newLen = newptr - *newBuf;
00067 
00068   return true;
00069 }
00070 
00071 //This filter applies to all scripts which contain both English characters and upper ASCII characters.
00072 bool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, unsigned int aLen, char** newBuf, unsigned int& newLen)
00073 {
00074   //do filtering to reduce load to probers
00075   char *newptr;
00076   char *prevPtr, *curPtr;
00077   bool isInTag = false;
00078 
00079   newptr = *newBuf = (char*)malloc(aLen);
00080   if (!newptr)
00081     return false;
00082 
00083   for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; ++curPtr)
00084   {
00085     if (*curPtr == '>')
00086       isInTag = false;
00087     else if (*curPtr == '<')
00088       isInTag = true;
00089 
00090     if (!(*curPtr & 0x80) &&
00091         (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
00092     {
00093       if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 
00094                                         // and it is not inside a tag, keep it.
00095       {
00096         while (prevPtr < curPtr) *newptr++ = *prevPtr++;  
00097         prevPtr++;
00098         *newptr++ = ' ';
00099       }
00100       else
00101         prevPtr = curPtr+1;
00102     }
00103   }
00104 
00105   // If the current segment contains more than just a symbol 
00106   // and it is not inside a tag then keep it.
00107   if (!isInTag)
00108     while (prevPtr < curPtr)
00109       *newptr++ = *prevPtr++;  
00110 
00111   newLen = newptr - *newBuf;
00112 
00113   return true;
00114 }
00115 }
00116 
00117 

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • Kross
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.5.7
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal