KDECore
UnicodeGroupProber.cpp
Go to the documentation of this file.00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 00002 /* -*- C++ -*- 00003 * Copyright (C) 2008 <wkai@gmail.com> 00004 * 00005 * 00006 * Permission is hereby granted, free of charge, to any person obtaining 00007 * a copy of this software and associated documentation files (the 00008 * "Software"), to deal in the Software without restriction, including 00009 * without limitation the rights to use, copy, modify, merge, publish, 00010 * distribute, sublicense, and/or sell copies of the Software, and to 00011 * permit persons to whom the Software is furnished to do so, subject to 00012 * the following conditions: 00013 * 00014 * The above copyright notice and this permission notice shall be included 00015 * in all copies or substantial portions of the Software. 00016 * 00017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00018 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00019 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 00020 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 00021 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 00022 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 00023 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 00024 */ 00025 00026 #include "UnicodeGroupProber.h" 00027 00028 #include "ctype_test_p.h" 00029 00030 #include <QtAlgorithms> 00031 #include <math.h> 00032 00033 namespace kencodingprober { 00034 UnicodeGroupProber::UnicodeGroupProber(void) 00035 { 00036 mCodingSM[0] = new nsCodingStateMachine(&UTF8SMModel); 00037 mCodingSM[1] = new nsCodingStateMachine(&UCS2LESMModel); 00038 mCodingSM[2] = new nsCodingStateMachine(&UCS2BESMModel); 00039 mActiveSM = NUM_OF_UNICODE_CHARSETS; 00040 mState = eDetecting; 00041 mDetectedCharset = "UTF-8"; 00042 } 00043 00044 UnicodeGroupProber::~UnicodeGroupProber(void) 00045 { 00046 for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) 00047 delete mCodingSM[i]; 00048 } 00049 00050 void UnicodeGroupProber::Reset(void) 00051 { 00052 mState = eDetecting; 00053 for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) 00054 mCodingSM[i]->Reset(); 00055 mActiveSM = NUM_OF_UNICODE_CHARSETS; 00056 mDetectedCharset = "UTF-8"; 00057 } 00058 00059 nsProbingState UnicodeGroupProber::HandleData(const char* aBuf, unsigned int aLen) 00060 { 00061 nsSMState codingState; 00062 int j; 00063 uint i, weight_BOM, counts[5]; 00064 static bool disableUTF16LE = false; 00065 static bool disableUTF16BE = false; 00066 double weight_zero; 00067 00068 if (mActiveSM <= 0) { 00069 mState = eNotMe; 00070 return mState; 00071 } 00072 00073 if (! (disableUTF16LE || disableUTF16BE)) { 00074 if (aLen%2 != 0) { 00075 disableUTF16LE = true; 00076 disableUTF16BE = true; 00077 } 00078 weight_BOM = (uint)(sqrt((double)aLen) + aLen/10.0); 00079 for (uint i = 0; i < 5; i++) 00080 qCount(aBuf, aBuf+aLen, i, counts[i]); 00081 weight_zero = (2.0*(counts[0] + counts[1] + counts[2] + counts[3] + counts[4]) + weight_BOM)/aLen; 00082 if (weight_zero < log(1.4142)) { 00083 disableUTF16LE = true; 00084 disableUTF16BE = true; 00085 } 00086 if (4 >= aBuf[1] && aBuf[1] >= 0 && isprint(aBuf[0])) 00087 disableUTF16BE = true; 00088 else 00089 disableUTF16LE = true; 00090 if (disableUTF16BE) 00091 mActiveSM--; 00092 if (disableUTF16LE) { 00093 nsCodingStateMachine* t; 00094 t = mCodingSM[1]; 00095 mCodingSM[1] = mCodingSM[2]; 00096 mCodingSM[2] = t; 00097 mActiveSM--; 00098 } 00099 } 00100 00101 for (i = 0; i < aLen; ++i) { 00102 for (j = mActiveSM-1; j>= 0; --j) 00103 { 00104 //byte is feed to all active state machine 00105 codingState = mCodingSM[j]->NextState(aBuf[i]); 00106 if (codingState == eError) 00107 { 00108 //got negative answer for this state machine, make it inactive 00109 mActiveSM--; 00110 if (mActiveSM == 0) 00111 { 00112 mState = eNotMe; 00113 return mState; 00114 } 00115 else if (j != (int)mActiveSM) 00116 { 00117 nsCodingStateMachine* t; 00118 t = mCodingSM[mActiveSM]; 00119 mCodingSM[mActiveSM] = mCodingSM[j]; 00120 mCodingSM[j] = t; 00121 } 00122 } 00123 else if (codingState == eItsMe) 00124 { 00125 mState = eFoundIt; 00126 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); 00127 return mState; 00128 } else if (mState == eDetecting) 00129 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();; 00130 } 00131 } 00132 return mState; 00133 } 00134 00135 float UnicodeGroupProber::GetConfidence() 00136 { 00137 if (mState == eFoundIt) 00138 return 0.99f; 00139 else 00140 return 0.0f; 00141 } 00142 00143 #ifdef DEBUG_PROBE 00144 void UnicodeGroupProber::DumpStatus() 00145 { 00146 GetConfidence(); 00147 for (uint i = 0; i < mActiveSM; i++) 00148 { 00149 kDebug(180) << "Unicode group" << mCodingSM[i]->DumpCurrentState() << mCodingSM[i]->GetCodingStateMachine() ; 00150 } 00151 } 00152 #endif 00153 00154 } 00155 00156