• Skip to content
  • Skip to link menu
KDE 4.2 API Reference
  • KDE API Reference
  • kdelibs
  • Sitemap
  • Contact Us
 

KDECore

kencodingprober.cpp

Go to the documentation of this file.
00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 2008 Wang Hoi (zealot.hoi@gmail.com)
00005 
00006     This library is free software; you can redistribute it and/or
00007     modify it under the terms of the GNU Library General Public
00008     License as published by the Free Software Foundation; either
00009     version 2 of the License, or (at your option) any later version.
00010 
00011     This library is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     Library General Public License for more details.
00015 
00016     You should have received a copy of the GNU Library General Public License
00017     along with this library; see the file COPYING.LIB.  If not, write to
00018     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019     Boston, MA 02110-1301, USA.
00020 
00021 */
00022 
00023 #include "kencodingprober.h"
00024 
00025 #include "klocale.h"
00026 
00027 #include "probers/nsCharSetProber.h"
00028 #include "probers/nsUniversalDetector.h"
00029 #include "probers/ChineseGroupProber.h"
00030 #include "probers/JapaneseGroupProber.h"
00031 #include "probers/UnicodeGroupProber.h"
00032 #include "probers/nsSBCSGroupProber.h"
00033 #include "probers/nsMBCSGroupProber.h"
00034 
00035 #include <string.h>
00036 
00037 #define MINIMUM_THRESHOLD (float)0.2
00038 
00039 class KEncodingProberPrivate
00040 {
00041 public:
00042     KEncodingProberPrivate(): encoding(strdup("")), prober(NULL), mStart(true) {};
00043     ~KEncodingProberPrivate()
00044     {
00045         delete encoding;
00046         delete prober;
00047     }
00048     void setProberType(KEncodingProber::ProberType pType)
00049     {
00050         proberType = pType;
00051         /* handle multi-byte encodings carefully , because they're hard to detect,
00052         *   and have to use some Stastics methods.
00053         * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
00054         *   because encoding state machine can detect many such encodings.
00055         */ 
00056         switch (proberType) {
00057             case KEncodingProber::None:
00058                 prober = NULL;
00059                 break;
00060             case KEncodingProber::Arabic:
00061             case KEncodingProber::Baltic:
00062             case KEncodingProber::CentralEuropean:
00063             case KEncodingProber::Cyrillic:
00064             case KEncodingProber::Greek:
00065             case KEncodingProber::Hebrew:
00066             case KEncodingProber::NorthernSaami:
00067             case KEncodingProber::Other:
00068             case KEncodingProber::SouthEasternEurope:
00069             case KEncodingProber::Thai:
00070             case KEncodingProber::Turkish:
00071             case KEncodingProber::WesternEuropean:
00072                 prober = new kencodingprober::nsSBCSGroupProber();
00073                 break;
00074             case KEncodingProber::ChineseSimplified:
00075             case KEncodingProber::ChineseTraditional:
00076                 prober = new kencodingprober::ChineseGroupProber();
00077                 break;
00078             case KEncodingProber::Japanese:
00079                 prober = new kencodingprober::JapaneseGroupProber();
00080                 break;
00081             case KEncodingProber::Korean:
00082                 prober = new kencodingprober::nsMBCSGroupProber();
00083                 break;
00084             case KEncodingProber::Unicode:
00085                 prober = new kencodingprober::UnicodeGroupProber();
00086                 break;
00087             case KEncodingProber::Universal:
00088                 prober = new kencodingprober::nsUniversalDetector();
00089                 break;
00090             default:
00091                 prober = NULL;
00092         }
00093     }
00094     void unicodeTest(const char *aBuf, int aLen)
00095     {
00096         if (mStart)
00097         {
00098             mStart = false;
00099             if (aLen > 3)
00100             switch (aBuf[0])
00101             {
00102                 case '\xEF':
00103                     if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
00104                     // EF BB BF  UTF-8 encoded BOM
00105                     encoding = "UTF-8";
00106                     break;
00107                 case '\xFE':
00108                     if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00109                         // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
00110                         encoding = "ISO-10646-UCS-4";
00111                     else if ('\xFF' == aBuf[1])
00112                         // FE FF  UTF-16, big endian BOM
00113                         encoding = "UTF-16BE";
00114                         break;
00115                 case '\x00':
00116                     if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
00117                         // 00 00 FE FF  UTF-32, big-endian BOM
00118                         encoding = "UTF-32BE";
00119                     else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
00120                         // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
00121                         encoding = "ISO-10646-UCS-4";
00122                         break;
00123                 case '\xFF':
00124                     if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00125                         // FF FE 00 00  UTF-32, little-endian BOM
00126                         encoding = "UTF-32LE";
00127                     else if ('\xFE' == aBuf[1])
00128                         // FF FE  UTF-16, little endian BOM
00129                         encoding = "UTF-16LE";
00130                         break;
00131             }  // switch
00132 
00133             if (encoding && strlen(encoding))
00134             {
00135                 proberState = KEncodingProber::FoundIt;
00136                 currentConfidence = 0.99f;
00137             }
00138         }
00139     }
00140     KEncodingProber::ProberType proberType;
00141     KEncodingProber::ProberState proberState;
00142     float currentConfidence;
00143     const char *encoding;
00144     kencodingprober::nsCharSetProber *prober;
00145     bool mStart;
00146 };
00147 
00148 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType): d(new KEncodingProberPrivate())
00149 {
00150     setProberType(proberType);
00151 }
00152 
00153 KEncodingProber::~KEncodingProber()
00154 {
00155     delete d;
00156 }
00157 
00158 void KEncodingProber::reset()
00159 {
00160     d->proberState = KEncodingProber::Probing;
00161     d->mStart = true;
00162 }
00163 
00164 KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data)
00165 {
00166     return feed(data.data(), data.size());
00167 }
00168 
00169 KEncodingProber::ProberState KEncodingProber::feed(const char* data, int len)
00170 {
00171     if (!d->prober)
00172         return d->proberState;
00173     if (d->proberState == Probing) {
00174         if (d->mStart) {
00175             d->unicodeTest(data, len);
00176             if (d->proberState == FoundIt)
00177                 return d->proberState;
00178         }
00179         d->prober->HandleData(data, len);
00180         switch (d->prober->GetState())
00181         {
00182             case kencodingprober::eNotMe:
00183                 d->proberState = NotMe;
00184                 break;
00185             case kencodingprober::eFoundIt:
00186                 d->proberState = FoundIt;
00187                 break;
00188             default:
00189                 d->proberState = Probing;
00190                 break;
00191         }
00192     }
00193 #ifdef DEBUG_PROBE
00194     d->prober->DumpStatus();
00195 #endif
00196     return d->proberState;
00197 }
00198 
00199 KEncodingProber::ProberState KEncodingProber::state() const
00200 {
00201     return d->proberState;
00202 }
00203 
00204 const char* KEncodingProber::encodingName() const
00205 {
00206     if (!d->prober)
00207         return strdup("UTF-8");
00208 
00209     return strdup(d->prober->GetCharSetName());
00210 }
00211 
00212 float KEncodingProber::confidence() const
00213 {
00214     if (!d->prober)
00215         return 0.0;
00216 
00217     return d->prober->GetConfidence();
00218 }
00219 
00220 KEncodingProber::ProberType KEncodingProber::proberType() const
00221 {
00222     return d->proberType;
00223 }
00224 
00225 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
00226 {
00227     d->setProberType(proberType);
00228     reset();
00229 }
00230 
00231 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString& lang)
00232 {
00233     if (lang.isEmpty())
00234         return KEncodingProber::Universal;
00235     else if (lang==i18nc("@item Text character set", "Disabled"))
00236         return KEncodingProber::None;
00237     else if (lang==i18nc("@item Text character set", "Universal"))
00238         return KEncodingProber::Universal;
00239     else if (lang==i18nc("@item Text character set", "Unicode"))
00240         return KEncodingProber::Unicode;
00241     else if (lang==i18nc("@item Text character set", "Cyrillic"))
00242         return KEncodingProber::Cyrillic;
00243     else if (lang==i18nc("@item Text character set", "Western European"))
00244         return KEncodingProber::WesternEuropean;
00245     else if (lang==i18nc("@item Text character set", "Central European"))
00246         return KEncodingProber::CentralEuropean;
00247     else if (lang==i18nc("@item Text character set", "Greek"))
00248         return KEncodingProber::Greek;
00249     else if (lang==i18nc("@item Text character set", "Hebrew"))
00250         return KEncodingProber::Hebrew;
00251     else if (lang==i18nc("@item Text character set", "Turkish"))
00252         return KEncodingProber::Turkish;
00253     else if (lang==i18nc("@item Text character set", "Japanese"))
00254         return KEncodingProber::Japanese;
00255     else if (lang==i18nc("@item Text character set", "Baltic"))
00256         return KEncodingProber::Baltic;
00257     else if (lang==i18nc("@item Text character set", "Chinese Traditional"))
00258         return KEncodingProber::ChineseTraditional;
00259     else if (lang==i18nc("@item Text character set", "Chinese Simplified"))
00260         return KEncodingProber::ChineseSimplified;
00261     else if (lang==i18nc("@item Text character set", "Arabic"))
00262         return KEncodingProber::Arabic;
00263 
00264     return KEncodingProber::Universal;
00265 }
00266 
00267 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
00268 {
00269     switch (proberType)
00270     {
00271         case KEncodingProber::None:
00272             return i18nc("@item Text character set", "Disabled");
00273             break;
00274         case KEncodingProber::Universal:
00275             return i18nc("@item Text character set", "Universal");
00276             break;
00277         case KEncodingProber::Arabic:
00278             return i18nc("@item Text character set", "Arabic");
00279             break;
00280         case KEncodingProber::Baltic:
00281             return i18nc("@item Text character set", "Baltic");
00282             break;
00283         case KEncodingProber::CentralEuropean:
00284             return i18nc("@item Text character set", "Central European");
00285             break;
00286         case KEncodingProber::Cyrillic:
00287             return i18nc("@item Text character set", "Cyrillic");
00288             break;
00289         case KEncodingProber::Greek:
00290             return i18nc("@item Text character set", "Greek");
00291             break;
00292         case KEncodingProber::Hebrew:
00293             return i18nc("@item Text character set", "Hebrew");
00294             break;
00295         case KEncodingProber::Japanese:
00296             return i18nc("@item Text character set", "Japanese");
00297             break;
00298         case KEncodingProber::Turkish:
00299             return i18nc("@item Text character set", "Turkish");
00300             break;
00301         case KEncodingProber::WesternEuropean:
00302             return i18nc("@item Text character set", "Western European");
00303             break;
00304         case KEncodingProber::ChineseTraditional:
00305             return i18nc("@item Text character set", "Chinese Traditional");
00306             break;
00307         case KEncodingProber::ChineseSimplified:
00308             return i18nc("@item Text character set", "Chinese Simplified");
00309             break;
00310         case KEncodingProber::Korean:
00311             return i18nc("@item Text character set", "Korean");
00312             break;
00313         case KEncodingProber::Thai:
00314             return i18nc("@item Text character set", "Thai");
00315             break;
00316         case KEncodingProber::Unicode:
00317             return i18nc("@item Text character set", "Unicode");
00318             break;
00319         default:
00320             return QString();
00321         }
00322 }

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • Kross
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.5.7
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal