KDECore
kencodingprober.cpp
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "kencodingprober.h"
00024
00025 #include "klocale.h"
00026
00027 #include "probers/nsCharSetProber.h"
00028 #include "probers/nsUniversalDetector.h"
00029 #include "probers/ChineseGroupProber.h"
00030 #include "probers/JapaneseGroupProber.h"
00031 #include "probers/UnicodeGroupProber.h"
00032 #include "probers/nsSBCSGroupProber.h"
00033 #include "probers/nsMBCSGroupProber.h"
00034
00035 #include <string.h>
00036
00037 #define MINIMUM_THRESHOLD (float)0.2
00038
00039 class KEncodingProberPrivate
00040 {
00041 public:
00042 KEncodingProberPrivate(): encoding(strdup("")), prober(NULL), mStart(true) {};
00043 ~KEncodingProberPrivate()
00044 {
00045 delete encoding;
00046 delete prober;
00047 }
00048 void setProberType(KEncodingProber::ProberType pType)
00049 {
00050 proberType = pType;
00051
00052
00053
00054
00055
00056 switch (proberType) {
00057 case KEncodingProber::None:
00058 prober = NULL;
00059 break;
00060 case KEncodingProber::Arabic:
00061 case KEncodingProber::Baltic:
00062 case KEncodingProber::CentralEuropean:
00063 case KEncodingProber::Cyrillic:
00064 case KEncodingProber::Greek:
00065 case KEncodingProber::Hebrew:
00066 case KEncodingProber::NorthernSaami:
00067 case KEncodingProber::Other:
00068 case KEncodingProber::SouthEasternEurope:
00069 case KEncodingProber::Thai:
00070 case KEncodingProber::Turkish:
00071 case KEncodingProber::WesternEuropean:
00072 prober = new kencodingprober::nsSBCSGroupProber();
00073 break;
00074 case KEncodingProber::ChineseSimplified:
00075 case KEncodingProber::ChineseTraditional:
00076 prober = new kencodingprober::ChineseGroupProber();
00077 break;
00078 case KEncodingProber::Japanese:
00079 prober = new kencodingprober::JapaneseGroupProber();
00080 break;
00081 case KEncodingProber::Korean:
00082 prober = new kencodingprober::nsMBCSGroupProber();
00083 break;
00084 case KEncodingProber::Unicode:
00085 prober = new kencodingprober::UnicodeGroupProber();
00086 break;
00087 case KEncodingProber::Universal:
00088 prober = new kencodingprober::nsUniversalDetector();
00089 break;
00090 default:
00091 prober = NULL;
00092 }
00093 }
00094 void unicodeTest(const char *aBuf, int aLen)
00095 {
00096 if (mStart)
00097 {
00098 mStart = false;
00099 if (aLen > 3)
00100 switch (aBuf[0])
00101 {
00102 case '\xEF':
00103 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
00104
00105 encoding = "UTF-8";
00106 break;
00107 case '\xFE':
00108 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00109
00110 encoding = "ISO-10646-UCS-4";
00111 else if ('\xFF' == aBuf[1])
00112
00113 encoding = "UTF-16BE";
00114 break;
00115 case '\x00':
00116 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
00117
00118 encoding = "UTF-32BE";
00119 else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
00120
00121 encoding = "ISO-10646-UCS-4";
00122 break;
00123 case '\xFF':
00124 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00125
00126 encoding = "UTF-32LE";
00127 else if ('\xFE' == aBuf[1])
00128
00129 encoding = "UTF-16LE";
00130 break;
00131 }
00132
00133 if (encoding && strlen(encoding))
00134 {
00135 proberState = KEncodingProber::FoundIt;
00136 currentConfidence = 0.99f;
00137 }
00138 }
00139 }
00140 KEncodingProber::ProberType proberType;
00141 KEncodingProber::ProberState proberState;
00142 float currentConfidence;
00143 const char *encoding;
00144 kencodingprober::nsCharSetProber *prober;
00145 bool mStart;
00146 };
00147
00148 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType): d(new KEncodingProberPrivate())
00149 {
00150 setProberType(proberType);
00151 }
00152
00153 KEncodingProber::~KEncodingProber()
00154 {
00155 delete d;
00156 }
00157
00158 void KEncodingProber::reset()
00159 {
00160 d->proberState = KEncodingProber::Probing;
00161 d->mStart = true;
00162 }
00163
00164 KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data)
00165 {
00166 return feed(data.data(), data.size());
00167 }
00168
00169 KEncodingProber::ProberState KEncodingProber::feed(const char* data, int len)
00170 {
00171 if (!d->prober)
00172 return d->proberState;
00173 if (d->proberState == Probing) {
00174 if (d->mStart) {
00175 d->unicodeTest(data, len);
00176 if (d->proberState == FoundIt)
00177 return d->proberState;
00178 }
00179 d->prober->HandleData(data, len);
00180 switch (d->prober->GetState())
00181 {
00182 case kencodingprober::eNotMe:
00183 d->proberState = NotMe;
00184 break;
00185 case kencodingprober::eFoundIt:
00186 d->proberState = FoundIt;
00187 break;
00188 default:
00189 d->proberState = Probing;
00190 break;
00191 }
00192 }
00193 #ifdef DEBUG_PROBE
00194 d->prober->DumpStatus();
00195 #endif
00196 return d->proberState;
00197 }
00198
00199 KEncodingProber::ProberState KEncodingProber::state() const
00200 {
00201 return d->proberState;
00202 }
00203
00204 const char* KEncodingProber::encodingName() const
00205 {
00206 if (!d->prober)
00207 return strdup("UTF-8");
00208
00209 return strdup(d->prober->GetCharSetName());
00210 }
00211
00212 float KEncodingProber::confidence() const
00213 {
00214 if (!d->prober)
00215 return 0.0;
00216
00217 return d->prober->GetConfidence();
00218 }
00219
00220 KEncodingProber::ProberType KEncodingProber::proberType() const
00221 {
00222 return d->proberType;
00223 }
00224
00225 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
00226 {
00227 d->setProberType(proberType);
00228 reset();
00229 }
00230
00231 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString& lang)
00232 {
00233 if (lang.isEmpty())
00234 return KEncodingProber::Universal;
00235 else if (lang==i18nc("@item Text character set", "Disabled"))
00236 return KEncodingProber::None;
00237 else if (lang==i18nc("@item Text character set", "Universal"))
00238 return KEncodingProber::Universal;
00239 else if (lang==i18nc("@item Text character set", "Unicode"))
00240 return KEncodingProber::Unicode;
00241 else if (lang==i18nc("@item Text character set", "Cyrillic"))
00242 return KEncodingProber::Cyrillic;
00243 else if (lang==i18nc("@item Text character set", "Western European"))
00244 return KEncodingProber::WesternEuropean;
00245 else if (lang==i18nc("@item Text character set", "Central European"))
00246 return KEncodingProber::CentralEuropean;
00247 else if (lang==i18nc("@item Text character set", "Greek"))
00248 return KEncodingProber::Greek;
00249 else if (lang==i18nc("@item Text character set", "Hebrew"))
00250 return KEncodingProber::Hebrew;
00251 else if (lang==i18nc("@item Text character set", "Turkish"))
00252 return KEncodingProber::Turkish;
00253 else if (lang==i18nc("@item Text character set", "Japanese"))
00254 return KEncodingProber::Japanese;
00255 else if (lang==i18nc("@item Text character set", "Baltic"))
00256 return KEncodingProber::Baltic;
00257 else if (lang==i18nc("@item Text character set", "Chinese Traditional"))
00258 return KEncodingProber::ChineseTraditional;
00259 else if (lang==i18nc("@item Text character set", "Chinese Simplified"))
00260 return KEncodingProber::ChineseSimplified;
00261 else if (lang==i18nc("@item Text character set", "Arabic"))
00262 return KEncodingProber::Arabic;
00263
00264 return KEncodingProber::Universal;
00265 }
00266
00267 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
00268 {
00269 switch (proberType)
00270 {
00271 case KEncodingProber::None:
00272 return i18nc("@item Text character set", "Disabled");
00273 break;
00274 case KEncodingProber::Universal:
00275 return i18nc("@item Text character set", "Universal");
00276 break;
00277 case KEncodingProber::Arabic:
00278 return i18nc("@item Text character set", "Arabic");
00279 break;
00280 case KEncodingProber::Baltic:
00281 return i18nc("@item Text character set", "Baltic");
00282 break;
00283 case KEncodingProber::CentralEuropean:
00284 return i18nc("@item Text character set", "Central European");
00285 break;
00286 case KEncodingProber::Cyrillic:
00287 return i18nc("@item Text character set", "Cyrillic");
00288 break;
00289 case KEncodingProber::Greek:
00290 return i18nc("@item Text character set", "Greek");
00291 break;
00292 case KEncodingProber::Hebrew:
00293 return i18nc("@item Text character set", "Hebrew");
00294 break;
00295 case KEncodingProber::Japanese:
00296 return i18nc("@item Text character set", "Japanese");
00297 break;
00298 case KEncodingProber::Turkish:
00299 return i18nc("@item Text character set", "Turkish");
00300 break;
00301 case KEncodingProber::WesternEuropean:
00302 return i18nc("@item Text character set", "Western European");
00303 break;
00304 case KEncodingProber::ChineseTraditional:
00305 return i18nc("@item Text character set", "Chinese Traditional");
00306 break;
00307 case KEncodingProber::ChineseSimplified:
00308 return i18nc("@item Text character set", "Chinese Simplified");
00309 break;
00310 case KEncodingProber::Korean:
00311 return i18nc("@item Text character set", "Korean");
00312 break;
00313 case KEncodingProber::Thai:
00314 return i18nc("@item Text character set", "Thai");
00315 break;
00316 case KEncodingProber::Unicode:
00317 return i18nc("@item Text character set", "Unicode");
00318 break;
00319 default:
00320 return QString();
00321 }
00322 }