• Skip to content
  • Skip to link menu
KDE 4.2 API Reference
  • KDE API Reference
  • kdelibs
  • Sitemap
  • Contact Us
 

KDECore

nsLatin1Prober.cpp

Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /*  -*- C++ -*-
00003 *  Copyright (C) 1998 <developer@mozilla.org>
00004 *
00005 *
00006 *  Permission is hereby granted, free of charge, to any person obtaining
00007 *  a copy of this software and associated documentation files (the
00008 *  "Software"), to deal in the Software without restriction, including
00009 *  without limitation the rights to use, copy, modify, merge, publish,
00010 *  distribute, sublicense, and/or sell copies of the Software, and to
00011 *  permit persons to whom the Software is furnished to do so, subject to
00012 *  the following conditions:
00013 *
00014 *  The above copyright notice and this permission notice shall be included 
00015 *  in all copies or substantial portions of the Software.
00016 *
00017 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00018 *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00019 *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00020 *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
00021 *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
00022 *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
00023 *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00024 */
00025 
00026 #include "nsLatin1Prober.h"
00027 #include <stdio.h>
00028 #include <stdlib.h>
00029 
// Character classes assigned to each byte by Latin1_CharToClass and used as
// row/column indices into Latin1ClassModel.
//
// These are enumerators rather than preprocessor macros so that the short
// names are real identifiers known to the compiler (and debugger) instead of
// being textually substituted into every later token that happens to match.
enum {
  UDF       = 0,   // undefined
  OTH       = 1,   // other
  ASC       = 2,   // ascii capital letter
  ASS       = 3,   // ascii small letter
  ACV       = 4,   // accent capital vowel
  ACO       = 5,   // accent capital other
  ASV       = 6,   // accent small vowel
  ASO       = 7,   // accent small other
  CLASS_NUM = 8    // total classes
};
00039 
00040 namespace kencodingprober {
00041 static unsigned char Latin1_CharToClass[] = 
00042 {
00043   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
00044   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
00045   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
00046   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
00047   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
00048   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
00049   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
00050   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
00051   OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
00052   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
00053   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
00054   ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
00055   OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
00056   ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
00057   ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
00058   ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
00059   OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
00060   OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
00061   UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
00062   OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
00063   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
00064   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
00065   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
00066   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
00067   ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
00068   ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
00069   ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
00070   ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
00071   ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
00072   ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
00073   ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
00074   ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
00075 };
00076 
00077 
/* Likelihood of one character class following another, stored row-major:
   the entry for a transition is found at
       Latin1ClassModel[previousClass * CLASS_NUM + currentClass]
   so each row is the class of the previous byte and each column the class
   of the current byte.

   0 : illegal
   1 : very unlikely
   2 : normal
   3 : very likely

   The table is read-only lookup data, hence const.
*/
static const unsigned char Latin1ClassModel[] =
{
/*      UDF OTH ASC ASS ACV ACO ASV ASO  */
/*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
/*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASC*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
/*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
/*ACO*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASV*/  0,  3,  1,  3,  1,  1,  1,  3,
/*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
};
00095 
00096 void  nsLatin1Prober::Reset(void)
00097 {
00098   mState = eDetecting;
00099   mLastCharClass = OTH;
00100   for (int i = 0; i < FREQ_CAT_NUM; i++)
00101     mFreqCounter[i] = 0;
00102 }
00103 
00104 
00105 nsProbingState nsLatin1Prober::HandleData(const char* aBuf, unsigned int aLen)
00106 {
00107   char *newBuf1 = 0;
00108   unsigned int newLen1 = 0;
00109 
00110   if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
00111     newBuf1 = (char*)aBuf;
00112     newLen1 = aLen;
00113   }
00114   
00115   unsigned char charClass;
00116   unsigned char freq;
00117   for (unsigned int i = 0; i < newLen1; i++)
00118   {
00119     charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
00120     freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
00121     if (freq == 0) {
00122       mState = eNotMe;
00123       break;
00124     }
00125     mFreqCounter[freq]++;
00126     mLastCharClass = charClass;
00127   }
00128 
00129   if (newBuf1 != aBuf)
00130     free(newBuf1);
00131 
00132   return mState;
00133 }
00134 
00135 float nsLatin1Prober::GetConfidence(void)
00136 {
00137   if (mState == eNotMe)
00138     return 0.01f;
00139   
00140   float confidence;
00141   unsigned int total = 0;
00142   for (int i = 0; i < FREQ_CAT_NUM; i++)
00143     total += mFreqCounter[i];
00144 
00145   if(!total)
00146     confidence = 0.0f;
00147   else
00148   {
00149     confidence = mFreqCounter[3]*1.0f / total;
00150     confidence -= mFreqCounter[1]*20.0f/total;
00151   }
00152 
00153   if (confidence < 0.0f)
00154     confidence = 0.0f;
00155   
00156   // lower the confidence of latin1 so that other more accurate detector 
00157   // can take priority.
00158   confidence *= 0.50f;
00159 
00160   return confidence;
00161 }
00162 
#ifdef DEBUG_PROBE
// Debug-only helper: prints the current confidence and the charset name
// reported by GetCharSetName() to standard output.
void nsLatin1Prober::DumpStatus()
{
  const float currentConfidence = GetConfidence();
  printf(" Latin1Prober: %1.3f [%s]\r\n", currentConfidence, GetCharSetName());
}
#endif
00169 }
00170 
00171 

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • Kross
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.5.7
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal