• Skip to content
  • Skip to link menu
KDE 4.2 API Reference
  • KDE API Reference
  • kdelibs
  • Sitemap
  • Contact Us
 

KDECore

nsHebrewProber.cpp

Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /*  -*- C++ -*-
00003 *  Copyright (C) 1998 <developer@mozilla.org>
00004 *
00005 *
00006 *  Permission is hereby granted, free of charge, to any person obtaining
00007 *  a copy of this software and associated documentation files (the
00008 *  "Software"), to deal in the Software without restriction, including
00009 *  without limitation the rights to use, copy, modify, merge, publish,
00010 *  distribute, sublicense, and/or sell copies of the Software, and to
00011 *  permit persons to whom the Software is furnished to do so, subject to
00012 *  the following conditions:
00013 *
00014 *  The above copyright notice and this permission notice shall be included 
00015 *  in all copies or substantial portions of the Software.
00016 *
00017 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00018 *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00019 *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00020 *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
00021 *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
00022 *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
00023 *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00024 */
00025 
00026 #include "nsHebrewProber.h"
00027 #include <stdio.h>
00028 
00029 // windows-1255 / ISO-8859-8 code points of the letters tested by isFinal() and isNonFinal()
00030 #define FINAL_KAF ('\xea')
00031 #define NORMAL_KAF ('\xeb')
00032 #define FINAL_MEM ('\xed')
00033 #define NORMAL_MEM ('\xee')
00034 #define FINAL_NUN ('\xef')
00035 #define NORMAL_NUN ('\xf0')
00036 #define FINAL_PE ('\xf3')
00037 #define NORMAL_PE ('\xf4')
00038 #define FINAL_TSADI ('\xf5')
00039 #define NORMAL_TSADI ('\xf6')
00040 
00041 // Minimum Visual vs Logical final letter score difference, used by GetCharSetName().
00042 // If the difference is below this, don't rely solely on the final letter score distance.
00043 #define MIN_FINAL_CHAR_DISTANCE (5)
00044 
00045 // Minimum Visual vs Logical model score difference, used by GetCharSetName().
00046 // If the difference is below this, don't rely at all on the model score distance.
00047 #define MIN_MODEL_DISTANCE (0.01)
00048 
00049 #define VISUAL_HEBREW_NAME ("ISO-8859-8")  // returned by GetCharSetName() for Visual Hebrew
00050 #define LOGICAL_HEBREW_NAME ("windows-1255")  // returned by GetCharSetName() for Logical Hebrew
00051 
00052 namespace kencodingprober {
00053 bool nsHebrewProber::isFinal(char c)
00054 {
00055   return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI));
00056 }
00057 
00058 bool nsHebrewProber::isNonFinal(char c)
00059 {
00060   return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE));
00061   // The normal Tsadi is not a good Non-Final letter due to words like 
00062   // 'lechotet' (to chat) containing an apostrophe after the tsadi. This 
00063   // apostrophe is converted to a space in FilterWithoutEnglishLetters causing 
00064   // the Non-Final tsadi to appear at an end of a word even though this is not 
00065   // the case in the original text.
00066   // The letters Pe and Kaf rarely display a related behavior of not being a 
00067   // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for 
00068   // example legally end with a Non-Final Pe or Kaf. However, the benefit of 
00069   // these letters as Non-Final letters outweighs the damage since these words 
00070   // are quite rare.
00071 }
00072 
00098 nsProbingState nsHebrewProber::HandleData(const char* aBuf, unsigned int aLen)
00099 {
00100   // Both model probers say it's not them. No reason to continue.
00101   if (GetState() == eNotMe)
00102     return eNotMe;
00103 
00104   const char *curPtr, *endPtr = aBuf+aLen;
00105   char cur;
00106 
00107   for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr)
00108   {
00109     cur = *curPtr;
00110     if (cur == ' ') // We stand on a space - a word just ended
00111     {
00112       if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word
00113       {
00114         if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space]
00115           ++mFinalCharLogicalScore;
00116         else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space]
00117           ++mFinalCharVisualScore;
00118       }
00119     }
00120     else  // Not standing on a space
00121     {
00122       if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space]
00123         ++mFinalCharVisualScore;
00124     }
00125     mBeforePrev = mPrev;
00126     mPrev = cur;
00127   }
00128 
00129   // Forever detecting, till the end or until both model probers return eNotMe (handled above).
00130   return eDetecting;
00131 }
00132 
00133 // Make the decision: is it Logical or Visual?
00134 const char* nsHebrewProber::GetCharSetName()
00135 {
00136   // If the final letter score distance is dominant enough, rely on it.
00137   int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore;
00138   if (finalsub >= MIN_FINAL_CHAR_DISTANCE) 
00139     return LOGICAL_HEBREW_NAME;
00140   if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE))
00141     return VISUAL_HEBREW_NAME;
00142 
00143   // It's not dominant enough, try to rely on the model scores instead.
00144   float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence();
00145   if (modelsub > MIN_MODEL_DISTANCE)
00146     return LOGICAL_HEBREW_NAME;
00147   if (modelsub < -(MIN_MODEL_DISTANCE))
00148     return VISUAL_HEBREW_NAME;
00149 
00150   // Still no good, back to final letter distance, maybe it'll save the day.
00151   if (finalsub < 0) 
00152     return VISUAL_HEBREW_NAME;
00153 
00154   // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
00155   return LOGICAL_HEBREW_NAME;
00156 }
00157 
00158 
00159 void nsHebrewProber::Reset(void)
00160 {
00161   mFinalCharLogicalScore = 0;
00162   mFinalCharVisualScore = 0;
00163 
00164   // mPrev and mBeforePrev are initialized to space in order to simulate a word 
00165   // delimiter at the beginning of the data
00166   mPrev = ' ';
00167   mBeforePrev = ' ';
00168 }
00169 
00170 nsProbingState nsHebrewProber::GetState(void) 
00171 {
00172   // Remain active as long as any of the model probers are active.
00173   if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe))
00174     return eNotMe;
00175   return eDetecting;
00176 }
00177 
00178 #ifdef DEBUG_PROBE
00179 void  nsHebrewProber::DumpStatus()
00180 {
00181   printf("  HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore);
00182 }
00183 #endif
00184 }
00185 
00186 

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • Kross
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.5.7
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal