Back to index

lightning-sunbird  0.9+nobinonly
mozEnglishWordUtils.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is Mozilla Spellchecker Component.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * David Einstein.
00019  * Portions created by the Initial Developer are Copyright (C) 2001
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s): David Einstein Deinst@world.std.com
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either the GNU General Public License Version 2 or later (the "GPL"), or
00026  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 
00038 #include "mozEnglishWordUtils.h"
00039 #include "nsICharsetAlias.h"
00040 #include "nsReadableUtils.h"
00041 #include "nsIServiceManager.h"
00042 #include "nsUnicharUtilCIID.h"
00043 #include "nsCRT.h"
00044 #include "cattable.h"
00045 
// Class ID passed to do_GetService() in GetRootForm to obtain the
// case-conversion service stored in mCaseConv.
static NS_DEFINE_CID(kUnicharUtilCID, NS_UNICHARUTIL_CID); 

// XPCOM nsISupports boilerplate: mozEnglishWordUtils exposes exactly one
// interface, mozISpellI18NUtil.
NS_IMPL_ISUPPORTS1(mozEnglishWordUtils, mozISpellI18NUtil)
00049 
00050 mozEnglishWordUtils::mozEnglishWordUtils()
00051 {
00052   mLanguage.AssignLiteral("en");
00053 
00054   nsresult rv;
00055   mURLDetector = do_CreateInstance(MOZ_TXTTOHTMLCONV_CONTRACTID, &rv);
00056 }
00057 
00058 mozEnglishWordUtils::~mozEnglishWordUtils()
00059 {
00060 }
00061 
00062 /* attribute wstring language; */
00063 NS_IMETHODIMP mozEnglishWordUtils::GetLanguage(PRUnichar * *aLanguage)
00064 {
00065   nsresult rv = NS_OK;
00066   NS_ENSURE_ARG_POINTER(aLanguage);
00067 
00068   *aLanguage = ToNewUnicode(mLanguage);
00069   if(!aLanguage) rv = NS_ERROR_OUT_OF_MEMORY;
00070   return rv;
00071  }
00072 
00073 /* void GetRootForm (in wstring aWord, in PRUint32 type, [array, size_is (count)] out wstring words, out PRUint32 count); */
00074 // return the possible root forms of aWord.
00075 NS_IMETHODIMP mozEnglishWordUtils::GetRootForm(const PRUnichar *aWord, PRUint32 type, PRUnichar ***words, PRUint32 *count)
00076 {
00077   nsAutoString word(aWord);
00078   PRUnichar **tmpPtr;
00079   PRInt32 length = word.Length();
00080 
00081   *count = 0;
00082 
00083   if (!mCaseConv) {
00084     mCaseConv = do_GetService(kUnicharUtilCID);
00085     if (!mCaseConv)
00086       return NS_ERROR_FAILURE;
00087   }
00088 
00089   mozEnglishWordUtils::myspCapitalization ct = captype(word);
00090   switch (ct)
00091     {
00092     case HuhCap:
00093     case NoCap: 
00094       tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *));
00095       if (!tmpPtr)
00096         return NS_ERROR_OUT_OF_MEMORY;
00097       tmpPtr[0] = ToNewUnicode(word);
00098       if (!tmpPtr[0]) {
00099         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
00100         return NS_ERROR_OUT_OF_MEMORY;
00101       }
00102       *words = tmpPtr;
00103       *count = 1;
00104       break;
00105     
00106 
00107     case AllCap:
00108       tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *) * 3);
00109       if (!tmpPtr)
00110         return NS_ERROR_OUT_OF_MEMORY;
00111       tmpPtr[0] = ToNewUnicode(word);
00112       if (!tmpPtr[0]) {
00113         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
00114         return NS_ERROR_OUT_OF_MEMORY;
00115       }
00116       mCaseConv->ToLower(tmpPtr[0], tmpPtr[0], length);
00117 
00118       tmpPtr[1] = ToNewUnicode(word);
00119       if (!tmpPtr[1]) {
00120         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
00121         return NS_ERROR_OUT_OF_MEMORY;
00122       }
00123       mCaseConv->ToLower(tmpPtr[1], tmpPtr[1], length);
00124       mCaseConv->ToUpper(tmpPtr[1], tmpPtr[1], 1);
00125 
00126       tmpPtr[2] = ToNewUnicode(word);
00127       if (!tmpPtr[2]) {
00128         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(2, tmpPtr);
00129         return NS_ERROR_OUT_OF_MEMORY;
00130       }
00131 
00132       *words = tmpPtr;
00133       *count = 3;
00134       break;
00135  
00136     case InitCap:  
00137       tmpPtr = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *) * 2);
00138       if (!tmpPtr)
00139         return NS_ERROR_OUT_OF_MEMORY;
00140 
00141       tmpPtr[0] = ToNewUnicode(word);
00142       if (!tmpPtr[0]) {
00143         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(0, tmpPtr);
00144         return NS_ERROR_OUT_OF_MEMORY;
00145       }
00146       mCaseConv->ToLower(tmpPtr[0], tmpPtr[0], length);
00147 
00148       tmpPtr[1] = ToNewUnicode(word);
00149       if (!tmpPtr[1]) {
00150         NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(1, tmpPtr);
00151         return NS_ERROR_OUT_OF_MEMORY;
00152       }
00153 
00154       *words = tmpPtr;
00155       *count = 2;
00156       break;
00157     default:
00158       return NS_ERROR_FAILURE; // should never get here;
00159     }
00160   return NS_OK;
00161 }
00162 
00163 // This needs vast improvement
00164 static PRBool ucIsAlpha(PRUnichar c)
00165 {
00166   // XXX we have to fix callers to handle the full Unicode range
00167   return (5 == GetCat(PRUint32(c)));
00168 }
00169 
00170 /* void FindNextWord (in wstring word, in PRUint32 length, in PRUint32 offset, out PRUint32 begin, out PRUint32 end); */
00171 NS_IMETHODIMP mozEnglishWordUtils::FindNextWord(const PRUnichar *word, PRUint32 length, PRUint32 offset, PRInt32 *begin, PRInt32 *end)
00172 {
00173   const PRUnichar *p = word + offset;
00174   const PRUnichar *endbuf = word + length;
00175   const PRUnichar *startWord=p;
00176   if(p<endbuf){
00177     // XXX These loops should be modified to handle non-BMP characters.
00178     // if previous character is a word character, need to advance out of the word
00179     if (offset > 0 && ucIsAlpha(*(p-1))) {
00180       while (p < endbuf && ucIsAlpha(*p))
00181         p++;
00182     }
00183     while((p < endbuf) && (!ucIsAlpha(*p)))
00184       {
00185         p++;
00186       }
00187     startWord=p;
00188     while((p < endbuf) && ((ucIsAlpha(*p))||(*p=='\'')))
00189       { 
00190         p++;
00191       }
00192     
00193     // we could be trying to break down a url, we don't want to break a url into parts,
00194     // instead we want to find out if it really is a url and if so, skip it, advancing startWord 
00195     // to a point after the url.
00196 
00197     // before we spend more time looking to see if the word is a url, look for a url identifer
00198     // and make sure that identifer isn't the last character in the word fragment.
00199     if ( (*p == ':' || *p == '@' || *p == '.') &&  p < endbuf - 1) {
00200 
00201         // ok, we have a possible url...do more research to find out if we really have one
00202         // and determine the length of the url so we can skip over it.
00203        
00204         if (mURLDetector)
00205         {
00206           PRInt32 startPos = -1;
00207           PRInt32 endPos = -1;        
00208 
00209           mURLDetector->FindURLInPlaintext(startWord, endbuf - startWord, p - startWord, &startPos, &endPos);
00210 
00211           // ok, if we got a url, adjust the array bounds, skip the current url text and find the next word again
00212           if (startPos != -1 && endPos != -1) { 
00213             startWord = p + endPos + 1; // skip over the url
00214             p = startWord; // reset p
00215 
00216             // now recursively call FindNextWord to search for the next word now that we have skipped the url
00217             return FindNextWord(word, length, startWord - word, begin, end);
00218           }
00219         }
00220     }
00221 
00222     while((p > startWord)&&(*(p-1) == '\'')){  // trim trailing apostrophes
00223       p--;
00224     }
00225   }
00226   else{
00227     startWord = endbuf;
00228   }
00229   if(startWord == endbuf){
00230     *begin = -1;
00231     *end = -1;
00232   }
00233   else{
00234     *begin = startWord-word;
00235     *end = p-word;
00236   }
00237   return NS_OK;
00238 }
00239 
00240 mozEnglishWordUtils::myspCapitalization 
00241 mozEnglishWordUtils::captype(const nsString &word)
00242 {
00243   if(!mCaseConv) return HuhCap; //punt
00244   PRUnichar* lword=ToNewUnicode(word);  
00245   mCaseConv->ToUpper(lword,lword,word.Length());
00246   if(word.Equals(lword)){
00247     nsMemory::Free(lword);
00248     return AllCap;
00249   }
00250 
00251   mCaseConv->ToLower(lword,lword,word.Length());
00252   if(word.Equals(lword)){
00253     nsMemory::Free(lword);
00254     return NoCap;
00255   }
00256   PRInt32 length=word.Length();
00257   if(Substring(word,1,length-1).Equals(lword+1)){
00258     nsMemory::Free(lword);
00259     return InitCap;
00260   }
00261   nsMemory::Free(lword);
00262   return HuhCap;
00263 }
00264 
00265 // Convert the list of words in iwords to the same capitalization aWord and 
00266 // return them in owords.
00267 NS_IMETHODIMP mozEnglishWordUtils::FromRootForm(const PRUnichar *aWord, const PRUnichar **iwords, PRUint32 icount, PRUnichar ***owords, PRUint32 *ocount)
00268 {
00269   nsAutoString word(aWord);
00270   nsresult rv = NS_OK;
00271 
00272   PRInt32 length;
00273   PRUnichar **tmpPtr  = (PRUnichar **)nsMemory::Alloc(sizeof(PRUnichar *)*icount);
00274   if (!tmpPtr)
00275     return NS_ERROR_OUT_OF_MEMORY;
00276 
00277   mozEnglishWordUtils::myspCapitalization ct = captype(word);
00278   for(PRUint32 i = 0; i < icount; ++i) {
00279     length = nsCRT::strlen(iwords[i]);
00280     tmpPtr[i] = (PRUnichar *) nsMemory::Alloc(sizeof(PRUnichar) * (length + 1));
00281     if (NS_UNLIKELY(!tmpPtr[i])) {
00282       NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(i, tmpPtr);
00283       return NS_ERROR_OUT_OF_MEMORY;
00284     }
00285     memcpy(tmpPtr[i], iwords[i], (length + 1) * sizeof(PRUnichar));
00286 
00287     nsAutoString capTest(tmpPtr[i]);
00288     mozEnglishWordUtils::myspCapitalization newCt=captype(capTest);
00289     if(newCt == NoCap){
00290       switch(ct) 
00291         {
00292         case HuhCap:
00293         case NoCap:
00294           break;
00295         case AllCap:
00296           rv = mCaseConv->ToUpper(tmpPtr[i],tmpPtr[i],length);
00297           break;
00298         case InitCap:  
00299           rv = mCaseConv->ToUpper(tmpPtr[i],tmpPtr[i],1);
00300           break;
00301         default:
00302           rv = NS_ERROR_FAILURE; // should never get here;
00303           break;
00304 
00305         }
00306     }
00307   }
00308   if (NS_SUCCEEDED(rv)){
00309     *owords = tmpPtr;
00310     *ocount = icount;
00311   }
00312   return rv;
00313 }
00314