Back to index

lightning-sunbird  0.9+nobinonly
nsSampleWordBreaker.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either of the GNU General Public License Version 2 or later (the "GPL"),
00026  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 
00038 
00039 #include "nsSampleWordBreaker.h"
00040 
00041 #include "pratom.h"
00042 #include "nsLWBRKDll.h"
00043 nsSampleWordBreaker::nsSampleWordBreaker()
00044 {
00045 }
00046 nsSampleWordBreaker::~nsSampleWordBreaker()
00047 {
00048 }
00049 
00050 NS_IMPL_ISUPPORTS1(nsSampleWordBreaker, nsIWordBreaker)
00051 
00052 nsresult nsSampleWordBreaker::BreakInBetween(
00053   const PRUnichar* aText1 , PRUint32 aTextLen1,
00054   const PRUnichar* aText2 , PRUint32 aTextLen2,
00055   PRBool *oCanBreak)
00056 {
00057   NS_PRECONDITION( nsnull != aText1, "null ptr");
00058   NS_PRECONDITION( nsnull != aText2, "null ptr");
00059 
00060   if((aText1 == nsnull) || (aText2 == nsnull))
00061     return NS_ERROR_NULL_POINTER; 
00062 
00063   if( (0 == aTextLen1) || (0 == aTextLen2))
00064   {
00065     *oCanBreak = PR_FALSE; 
00066     return NS_OK;
00067   }
00068 
00069   *oCanBreak = (this->GetClass(aText1[aTextLen1-1]) != this->GetClass(aText2[0]));
00070 
00071   return NS_OK;
00072 }
00073 
00074 
// Character classification helpers for 16-bit code units (used by GetClass()).
// Most of these evaluate their argument more than once, so only pass
// expressions without side effects.

// True when no bit above the low seven is set, i.e. the value is below 0x80.
#define IS_ASCII(c)            (0 == ( 0xFF80 & (c)))
#define ASCII_IS_ALPHA(c)         ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z')))
#define ASCII_IS_DIGIT(c)         (( '0' <= (c)) && ((c) <= '9'))
#define ASCII_IS_SPACE(c)         (( ' ' == (c)) || ( '\t' == (c)) || ( '\r' == (c)) || ( '\n' == (c)))
// Everything below 0x2E80 is handled by the "alphabetical script" branch.
#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80)

// We changed the beginning of IS_HAN from 0x4e00 to 0x3400 to reflect Unicode 3.0.
// The whole expansion is wrapped in parentheses so that the macro can be
// negated or combined with other operators without changing its meaning.
#define IS_HAN(c)              ((( 0x3400 <= (c)) && ((c) <= 0x9fff)) || (( 0xf900 <= (c)) && ((c) <= 0xfaff)))
#define IS_KATAKANA(c)         (( 0x30A0 <= (c)) && ((c) <= 0x30FF))
#define IS_HIRAGANA(c)         (( 0x3040 <= (c)) && ((c) <= 0x309F))
#define IS_HALFWIDTHKATAKANA(c)         (( 0xFF60 <= (c)) && ((c) <= 0xFF9F))
// Look at the highest 9 bits: matches 0x0E00 through 0x0E7F.
#define IS_THAI(c)         (0x0E00 == (0xFF80 & (c) ))
00087 
00088 PRUint8 nsSampleWordBreaker::GetClass(PRUnichar c)
00089 {
00090   // begin of the hack
00091 
00092   if (IS_ALPHABETICAL_SCRIPT(c))  {
00093          if(IS_ASCII(c))  {
00094                 if(ASCII_IS_SPACE(c)) {
00095                        return kWbClassSpace;
00096                 } else if(ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c)) {
00097                        return kWbClassAlphaLetter;
00098                 } else {
00099                        return kWbClassPunct;
00100                 }
00101          } else if(IS_THAI(c))     {
00102                 return kWbClassThaiLetter;
00103          } else {
00104                 return kWbClassAlphaLetter;
00105          }
00106   }  else {
00107          if(IS_HAN(c)) {
00108                 return kWbClassHanLetter;
00109          } else if(IS_KATAKANA(c))   {
00110                 return kWbClassKatakanaLetter;
00111          } else if(IS_HIRAGANA(c))   {
00112                 return kWbClassHiraganaLetter;
00113          } else if(IS_HALFWIDTHKATAKANA(c))  {
00114                 return kWbClassHWKatakanaLetter;
00115          } else  {
00116                 return kWbClassAlphaLetter;
00117          }
00118   }
00119   return 0;
00120 }
00121 
00122 nsresult nsSampleWordBreaker::FindWord(
00123   const PRUnichar* aText , PRUint32 aTextLen,
00124   PRUint32 aOffset,
00125   PRUint32 *oWordBegin,
00126   PRUint32 *oWordEnd)
00127 {
00128   NS_PRECONDITION( nsnull != aText, "null ptr");
00129   NS_PRECONDITION( 0 != aTextLen, "len = 0");
00130   NS_PRECONDITION( nsnull != oWordBegin, "null ptr");
00131   NS_PRECONDITION( nsnull != oWordEnd, "null ptr");
00132   NS_PRECONDITION( aOffset <= aTextLen, "aOffset > aTextLen");
00133 
00134   if((nsnull == aText ) || (nsnull == oWordBegin) || (nsnull == oWordEnd))
00135     return NS_ERROR_NULL_POINTER; 
00136   
00137   if( aOffset > aTextLen )
00138     return NS_ERROR_ILLEGAL_VALUE;
00139 
00140 
00141   PRUint8 c = this->GetClass(aText[aOffset]);
00142   PRUint32 i;
00143   // Scan forward
00144   *oWordEnd = aTextLen;
00145   for(i = aOffset +1;i <= aTextLen; i++)
00146   {
00147      if( c != this->GetClass(aText[i]))
00148      {
00149        *oWordEnd = i;
00150        break;
00151      }
00152   }
00153 
00154   // Scan backward
00155   *oWordBegin = 0;
00156   for(i = aOffset ;i > 0; i--)
00157   {
00158      if( c != this->GetClass(aText[i-1]))
00159      {
00160        *oWordBegin = i;
00161        break;
00162      }
00163   }
00164   if(kWbClassThaiLetter == c)
00165   {
00166        // need to call Thai word breaker from here
00167        // we should pass the whole Thai segment to the thai word breaker to find a shorter answer
00168   }
00169   return NS_OK;
00170 }
00171 
00172 nsresult nsSampleWordBreaker::NextWord( 
00173   const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos,
00174   PRUint32* oNext, PRBool *oNeedMoreText) 
00175 {
00176   PRInt8 c1, c2;
00177   PRUint32 cur = aPos;
00178   c1 = this->GetClass(aText[cur]);
00179  
00180   for(cur++; cur <aLen; cur++)
00181   {
00182      c2 = this->GetClass(aText[cur]);
00183      if(c2 != c1) 
00184        break;
00185   }
00186   if(kWbClassThaiLetter == c1)
00187   {
00188        // need to call Thai word breaker from here
00189        // we should pass the whole Thai segment to the thai word breaker to find a shorter answer
00190   }
00191   *oNext = cur;
00192   *oNeedMoreText = (cur == aLen) ? PR_TRUE : PR_FALSE;
00193   return NS_OK;
00194 }
00195 
00196 nsresult nsSampleWordBreaker::PrevWord(const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos,
00197   PRUint32* oPrev, PRBool *oNeedMoreText) 
00198 {
00199   PRInt8 c1, c2;
00200   PRUint32 cur = aPos;
00201   c1 = this->GetClass(aText[cur]);
00202 
00203   for(; cur > 0; cur--)
00204   {
00205      c2 = this->GetClass(aText[cur-1]);
00206      if(c2 != c1)
00207        break;
00208   }
00209   if(kWbClassThaiLetter == c1)
00210   {
00211        // need to call Thai word breaker from here
00212        // we should pass the whole Thai segment to the thai word breaker to find a shorter answer
00213   }
00214   *oPrev = cur;
00215   *oNeedMoreText = (cur == 0) ? PR_TRUE : PR_FALSE;
00216   return NS_OK;
00217 }