Back to index

lightning-sunbird  0.9+nobinonly
mozInlineSpellWordUtil.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is inline spellchecker code.
00016  *
00017  * The Initial Developer of the Original Code is Google Inc.
00018  * Portions created by the Initial Developer are Copyright (C) 2004-2006
00019  * the Initial Developer. All Rights Reserved.
00020  *
00021  * Contributor(s):
00022  *   Brett Wilson <brettw@gmail.com> (original author)
00023  *   Robert O'Callahan <rocallahan@novell.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either the GNU General Public License Version 2 or later (the "GPL"), or
00027  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 
00039 #include "cattable.h"
00040 #include "mozInlineSpellWordUtil.h"
00041 #include "nsDebug.h"
00042 #include "nsIAtom.h"
00043 #include "nsComponentManagerUtils.h"
00044 #include "nsIDOMCSSStyleDeclaration.h"
00045 #include "nsIDOMDocumentView.h"
00046 #include "nsIDOMElement.h"
00047 #include "nsIDOMNSRange.h"
00048 #include "nsIDOMRange.h"
00049 #include "nsIEditor.h"
00050 #include "nsIDOMNode.h"
00051 #include "nsIDOMHTMLBRElement.h"
00052 
00053 // some character categories we care about from GetCat()
00054 #define CHAR_CAT_NUMBER 2
00055 #define CHAR_CAT_SPACE 3
00056 #define CHAR_CAT_CONTROL 4
00057 #define CHAR_CAT_WORD 5
00058 #define CHAR_CAT_PUNCTUATION1 6
00059 #define CHAR_CAT_PUNCTUATION2 7
00060 
00061 // IsIgnorableCharacter
00062 //
00063 //    These characters are ones that we should ignore in input.
00064 
00065 inline PRBool IsIgnorableCharacter(PRUnichar ch)
00066 {
00067   return (ch == 0x200D || // ZERO-WIDTH JOINER
00068           ch == 0xAD ||   // SOFT HYPHEN
00069           ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN
00070 }
00071 
00072 // IsConditionalPunctuation
00073 //
00074 //    Some characters (like apostrophes) require characters on each side to be
00075 //    part of a word, and are otherwise punctuation.
00076 
00077 inline PRBool IsConditionalPunctuation(PRUnichar ch)
00078 {
00079   return (ch == '\'' ||
00080           ch == 0x2019); // RIGHT SINGLE QUOTATION MARK
00081 }
00082 
00083 // mozInlineSpellWordUtil::Init
00084 
00085 nsresult
00086 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)
00087 {
00088   nsresult rv;
00089 
00090   // getting the editor can fail commonly because the editor was detached, so
00091   // don't assert
00092   nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
00093   if (NS_FAILED(rv))
00094     return rv;
00095 
00096   nsCOMPtr<nsIDOMDocument> domDoc;
00097   rv = editor->GetDocument(getter_AddRefs(domDoc));
00098   NS_ENSURE_SUCCESS(rv, rv);
00099 
00100   mDocument = do_QueryInterface(domDoc, &rv);
00101   NS_ENSURE_SUCCESS(rv, rv);
00102 
00103   mDOMDocumentRange = do_QueryInterface(domDoc, &rv);
00104   NS_ENSURE_SUCCESS(rv, rv);
00105 
00106   // view
00107   nsCOMPtr<nsIDOMDocumentView> docView = do_QueryInterface(domDoc, &rv);
00108   NS_ENSURE_SUCCESS(rv, rv);
00109   nsCOMPtr<nsIDOMAbstractView> abstractView;
00110   rv = docView->GetDefaultView(getter_AddRefs(abstractView));
00111   NS_ENSURE_SUCCESS(rv, rv);
00112   mCSSView = do_QueryInterface(abstractView, &rv);
00113   NS_ENSURE_SUCCESS(rv, rv);
00114 
00115   // Find the root node for the editor. For contenteditable we'll need something
00116   // cleverer here.
00117   nsCOMPtr<nsIDOMElement> rootElt;
00118   rv = editor->GetRootElement(getter_AddRefs(rootElt));
00119   NS_ENSURE_SUCCESS(rv, rv);
00120   
00121   mRootNode = rootElt;
00122   NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
00123   return NS_OK;
00124 }
00125 
00126 static PRBool
00127 IsTextNode(nsIDOMNode* aNode)
00128 {
00129   PRUint16 type = 0;
00130   aNode->GetNodeType(&type);
00131   return type == nsIDOMNode::TEXT_NODE;
00132 }
00133 
00134 typedef void (* OnLeaveNodeFunPtr)(nsIDOMNode* aNode, void* aClosure);
00135 
00136 // Find the next node in the DOM tree in preorder. This isn't fast because
00137 // one call to GetNextSibling can be O(N) in the number of siblings...
00138 // Calls OnLeaveNodeFunPtr when the traversal leaves a node
00139 static nsIDOMNode*
00140 FindNextNode(nsIDOMNode* aNode, nsIDOMNode* aRoot,
00141              OnLeaveNodeFunPtr aOnLeaveNode = nsnull, void* aClosure = nsnull)
00142 {
00143   NS_PRECONDITION(aNode, "Null starting node?");
00144 
00145   nsCOMPtr<nsIDOMNode> next;
00146   aNode->GetFirstChild(getter_AddRefs(next));
00147   if (next)
00148     return next;
00149   
00150   // Don't look at siblings or otherwise outside of aRoot
00151   if (aNode == aRoot)
00152     return nsnull;
00153 
00154   aNode->GetNextSibling(getter_AddRefs(next));
00155   if (next)
00156     return next;
00157 
00158   // Go up
00159   for (;;) {
00160     if (aOnLeaveNode) {
00161       aOnLeaveNode(aNode, aClosure);
00162     }
00163     
00164     aNode->GetParentNode(getter_AddRefs(next));
00165     if (next == aRoot || ! next)
00166       return nsnull;
00167     aNode = next;
00168     
00169     aNode->GetNextSibling(getter_AddRefs(next));
00170     if (next)
00171       return next;
00172   }
00173 }
00174 
00175 // aNode is not a text node. Find the first text node starting at aNode/aOffset
00176 // in a preorder DOM traversal.
00177 static nsIDOMNode*
00178 FindNextTextNode(nsIDOMNode* aNode, PRInt32 aOffset, nsIDOMNode* aRoot)
00179 {
00180   NS_PRECONDITION(aNode, "Null starting node?");
00181   NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");
00182 
00183   nsIDOMNode* checkNode;
00184   // Need to start at the aOffset'th child
00185   nsCOMPtr<nsIDOMNode> child;
00186   aNode->GetFirstChild(getter_AddRefs(child));
00187   while (child && aOffset > 0) {
00188     nsCOMPtr<nsIDOMNode> next;
00189     child->GetNextSibling(getter_AddRefs(next));
00190     child.swap(next);
00191     --aOffset;
00192   }
00193   if (child) {
00194     checkNode = child;
00195   } else {
00196     // aOffset was beyond the end of the child list. Start checking at the next
00197     // node after the last child, or aNode if there are no children.
00198     aNode->GetLastChild(getter_AddRefs(child));
00199     if (child) {
00200       checkNode = FindNextNode(child, aRoot);
00201     } else {
00202       checkNode = FindNextNode(aNode, aRoot);
00203     }
00204   }
00205   
00206   while (checkNode && !IsTextNode(checkNode)) {
00207     checkNode = FindNextNode(checkNode, aRoot);
00208   }
00209   return checkNode;
00210 }
00211 
00212 // mozInlineSpellWordUtil::SetEnd
00213 //
00214 //    We have two ranges "hard" and "soft". The hard boundary is simply
00215 //    the scope of the root node. The soft boundary is that which is set
00216 //    by the caller of this class by calling this function. If this function is
00217 //    not called, the soft boundary is the same as the hard boundary.
00218 //
00219 //    When we reach the soft boundary (mSoftEnd), we keep
00220 //    going until we reach the end of a word. This allows the caller to set the
00221 //    end of the range to anything, and we will always check whole multiples of
00222 //    words. When we reach the hard boundary we stop no matter what.
00223 //
00224 //    There is no beginning soft boundary. This is because we only go to the
00225 //    previous node once, when finding the previous word boundary in
00226 //    SetPosition(). You might think of the soft boundary as being this initial
00227 //    position.
00228 
00229 nsresult
00230 mozInlineSpellWordUtil::SetEnd(nsIDOMNode* aEndNode, PRInt32 aEndOffset)
00231 {
00232   NS_PRECONDITION(aEndNode, "Null end node?");
00233 
00234   NS_ASSERTION(mRootNode, "Not initialized");
00235 
00236   InvalidateWords();
00237 
00238   if (!IsTextNode(aEndNode)) {
00239     // End at the start of the first text node after aEndNode/aEndOffset.
00240     aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
00241     aEndOffset = 0;
00242   }
00243   mSoftEnd = NodeOffset(aEndNode, aEndOffset);
00244   return NS_OK;
00245 }
00246 
00247 nsresult
00248 mozInlineSpellWordUtil::SetPosition(nsIDOMNode* aNode, PRInt32 aOffset)
00249 {
00250   InvalidateWords();
00251 
00252   if (!IsTextNode(aNode)) {
00253     // Start at the start of the first text node after aNode/aOffset.
00254     aNode = FindNextTextNode(aNode, aOffset, mRootNode);
00255     aOffset = 0;
00256   }
00257   mSoftBegin = NodeOffset(aNode, aOffset);
00258 
00259   EnsureWords();
00260   
00261   PRInt32 textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
00262   if (textOffset < 0)
00263     return NS_OK;
00264   mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, PR_TRUE);
00265   return NS_OK;
00266 }
00267 
00268 void
00269 mozInlineSpellWordUtil::EnsureWords()
00270 {
00271   if (mSoftTextValid)
00272     return;
00273   BuildSoftText();
00274   BuildRealWords();
00275   mSoftTextValid = PR_TRUE;
00276 }
00277 
00278 nsresult
00279 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsIDOMRange** aRange)
00280 {
00281   NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
00282   NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
00283   return MakeRange(begin, end, aRange);
00284 }
00285 
00286 // mozInlineSpellWordUtil::GetRangeForWord
00287 
00288 nsresult
00289 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
00290                                         PRInt32 aWordOffset,
00291                                         nsIDOMRange** aRange)
00292 {
00293   // Set our soft end and start
00294   NodeOffset pt = NodeOffset(aWordNode, aWordOffset);
00295   
00296   InvalidateWords();
00297   mSoftBegin = mSoftEnd = pt;
00298   EnsureWords();
00299   
00300   PRInt32 offset = MapDOMPositionToSoftTextOffset(pt);
00301   if (offset < 0)
00302     return MakeRange(pt, pt, aRange);
00303   PRInt32 wordIndex = FindRealWordContaining(offset, HINT_BEGIN, PR_FALSE);
00304   if (wordIndex < 0)
00305     return MakeRange(pt, pt, aRange);
00306   return MakeRangeForWord(mRealWords[wordIndex], aRange);
00307 }
00308 
00309 // This is to fix characters that the spellchecker may not like
00310 static void
00311 NormalizeWord(const nsSubstring& aInput, PRInt32 aPos, PRInt32 aLen, nsAString& aOutput)
00312 {
00313   aOutput.Truncate();
00314   for (PRInt32 i = 0; i < aLen; i++) {
00315     PRUnichar ch = aInput.CharAt(i + aPos);
00316 
00317     // remove ignorable characters from the word
00318     if (IsIgnorableCharacter(ch))
00319       continue;
00320 
00321     // the spellchecker doesn't handle curly apostrophes in all languages
00322     if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
00323       ch = '\'';
00324     }
00325 
00326     aOutput.Append(ch);
00327   }
00328 }
00329 
00330 // mozInlineSpellWordUtil::GetNextWord
00331 //
00332 //    FIXME-optimization: we shouldn't have to generate a range every single
00333 //    time. It would be better if the inline spellchecker didn't require a
00334 //    range unless the word was misspelled. This may or may not be possible.
00335 
00336 nsresult
00337 mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsIDOMRange** aRange,
00338                                     PRBool* aSkipChecking)
00339 {
00340 #ifdef DEBUG_SPELLCHECK
00341   printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
00342 #endif
00343 
00344   if (mNextWordIndex < 0 ||
00345       mNextWordIndex >= PRInt32(mRealWords.Length())) {
00346     mNextWordIndex = -1;
00347     *aRange = nsnull;
00348     *aSkipChecking = PR_TRUE;
00349     return NS_OK;
00350   }
00351   
00352   const RealWord& word = mRealWords[mNextWordIndex];
00353   nsresult rv = MakeRangeForWord(word, aRange);
00354   NS_ENSURE_SUCCESS(rv, rv);
00355   ++mNextWordIndex;
00356   *aSkipChecking = !word.mCheckableWord;
00357   ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);
00358 
00359 #ifdef DEBUG_SPELLCHECK
00360   printf("GetNextWord returning: %s (skip=%d)\n",
00361          NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
00362 #endif
00363   
00364   return NS_OK;
00365 }
00366 
00367 // mozInlineSpellWordUtil::MakeRange
00368 //
00369 //    Convenience function for creating a range over the current document.
00370 
00371 nsresult
00372 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
00373                                   nsIDOMRange** aRange)
00374 {
00375   if (! mDOMDocumentRange)
00376     return NS_ERROR_NOT_INITIALIZED;
00377 
00378   nsresult rv = mDOMDocumentRange->CreateRange(aRange);
00379   NS_ENSURE_SUCCESS(rv, rv);
00380 
00381   rv = (*aRange)->SetStart(aBegin.mNode, aBegin.mOffset);
00382   NS_ENSURE_SUCCESS(rv, rv);
00383   rv = (*aRange)->SetEnd(aEnd.mNode, aEnd.mOffset);
00384   NS_ENSURE_SUCCESS(rv, rv);
00385 
00386   return NS_OK;
00387 }
00388 
00389 /*********** DOM text extraction ************/
00390 
00391 // IsDOMWordSeparator
00392 //
00393 //    Determines if the given character should be considered as a DOM Word
00394 //    separator. Basically, this is whitespace, although it could also have
00395 //    certain punctuation that we know ALWAYS breaks words. This is important.
00396 //    For example, we can't have any punctuation that could appear in a URL
00397 //    or email address in this, because those need to always fit into a single
00398 //    DOM word.
00399 
00400 static PRBool
00401 IsDOMWordSeparator(PRUnichar ch)
00402 {
00403   // simple spaces
00404   if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')
00405     return PR_TRUE;
00406 
00407   // complex spaces - check only if char isn't ASCII (uncommon)
00408   if (ch >= 0xA0 &&
00409       (ch == 0x00A0 ||  // NO-BREAK SPACE
00410        ch == 0x2002 ||  // EN SPACE
00411        ch == 0x2003 ||  // EM SPACE
00412        ch == 0x2009 ||  // THIN SPACE
00413        ch == 0x200C ||  // ZERO WIDTH NON-JOINER
00414        ch == 0x3000))   // IDEOGRAPHIC SPACE
00415     return PR_TRUE;
00416 
00417   // otherwise not a space
00418   return PR_FALSE;
00419 }
00420 
00421 static PRBool
00422 IsBRElement(nsIDOMNode* aNode)
00423 {
00424   nsresult rv;
00425   nsCOMPtr<nsIDOMHTMLBRElement> elt = do_QueryInterface(aNode, &rv);
00426   return NS_SUCCEEDED(rv);
00427 }
00428 
00429 static void
00430 GetNodeText(nsIDOMNode* aNode, nsAutoString& aText)
00431 {
00432   nsresult rv = aNode->GetNodeValue(aText);
00433   NS_ASSERTION(NS_SUCCEEDED(rv), "Unable to get node text");
00434 }
00435 
00436 // Find the previous node in the DOM tree in preorder. This isn't fast because
00437 // one call to GetPrevSibling can be O(N) in the number of siblings...
00438 static nsIDOMNode*
00439 FindPrevNode(nsIDOMNode* aNode, nsIDOMNode* aRoot)
00440 {
00441   if (aNode == aRoot)
00442     return nsnull;
00443   
00444   nsCOMPtr<nsIDOMNode> prev;
00445   aNode->GetPreviousSibling(getter_AddRefs(prev));
00446   if (prev) {
00447     for (;;) {
00448       nsCOMPtr<nsIDOMNode> lastChild;
00449       prev->GetLastChild(getter_AddRefs(lastChild));
00450       if (!lastChild)
00451         return prev;
00452       prev = lastChild;
00453     }
00454   }
00455 
00456   // No prev sibling. So we are the first child of our parent, if any. Our
00457   // parent is our previous node.
00458   aNode->GetParentNode(getter_AddRefs(prev));
00459   return prev;
00460 }
00461 
00468 static PRBool
00469 ContainsDOMWordSeparator(nsIDOMNode* aNode, PRInt32 aBeforeOffset,
00470                          PRInt32* aSeparatorOffset)
00471 {
00472   if (IsBRElement(aNode)) {
00473     *aSeparatorOffset = 0;
00474     return PR_TRUE;
00475   }
00476   
00477   if (!IsTextNode(aNode))
00478     return PR_FALSE;
00479 
00480   nsAutoString str;
00481   GetNodeText(aNode, str);
00482   for (PRInt32 i = PR_MIN(aBeforeOffset, PRInt32(str.Length())) - 1; i >= 0; --i) {
00483     if (IsDOMWordSeparator(str.CharAt(i))) {
00484       *aSeparatorOffset = i;
00485       return PR_TRUE;
00486     }
00487   }
00488   return PR_FALSE;
00489 }
00490 
00491 static PRBool
00492 IsBreakElement(nsIDOMViewCSS* aDocView, nsIDOMNode* aNode)
00493 {
00494   nsCOMPtr<nsIDOMElement> element = do_QueryInterface(aNode);
00495   if (!element)
00496     return PR_FALSE;
00497     
00498   if (IsBRElement(aNode))
00499     return PR_TRUE;
00500   
00501   nsCOMPtr<nsIDOMCSSStyleDeclaration> style;
00502   aDocView->GetComputedStyle(element, EmptyString(), getter_AddRefs(style));
00503   if (!style)
00504     return PR_FALSE;
00505 
00506 #ifdef DEBUG_SPELLCHECK
00507   printf("    searching element %p\n", (void*)aNode);
00508 #endif
00509 
00510   nsAutoString display;
00511   style->GetPropertyValue(NS_LITERAL_STRING("display"), display);
00512 #ifdef DEBUG_SPELLCHECK
00513   printf("      display=\"%s\"\n", NS_ConvertUTF16toUTF8(display).get());
00514 #endif
00515   if (!display.EqualsLiteral("inline"))
00516     return PR_TRUE;
00517 
00518   nsAutoString position;
00519   style->GetPropertyValue(NS_LITERAL_STRING("position"), position);
00520 #ifdef DEBUG_SPELLCHECK
00521   printf("      position=%s\n", NS_ConvertUTF16toUTF8(position).get());
00522 #endif
00523   if (!position.EqualsLiteral("static"))
00524     return PR_TRUE;
00525     
00526   // XXX What about floats? What else?
00527   return PR_FALSE;
00528 }
00529 
00530 struct CheckLeavingBreakElementClosure {
00531   nsIDOMViewCSS* mDocView;
00532   PRPackedBool   mLeftBreakElement;
00533 };
00534 
00535 static void
00536 CheckLeavingBreakElement(nsIDOMNode* aNode, void* aClosure)
00537 {
00538   CheckLeavingBreakElementClosure* cl =
00539     NS_STATIC_CAST(CheckLeavingBreakElementClosure*, aClosure);
00540   if (!cl->mLeftBreakElement && IsBreakElement(cl->mDocView, aNode)) {
00541     cl->mLeftBreakElement = PR_TRUE;
00542   }
00543 }
00544 
00545 void
00546 mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
00547 {
00548   nsAutoString result;
00549   ::NormalizeWord(aWord, 0, aWord.Length(), result);
00550   aWord = result;
00551 }
00552 
00553 void
00554 mozInlineSpellWordUtil::BuildSoftText()
00555 {
00556   // First we have to work backwards from mSoftStart to find a text node
00557   // containing a DOM word separator, a non-inline-element
00558   // boundary, or the hard start node. That's where we'll start building the
00559   // soft string from.
00560   nsIDOMNode* node = mSoftBegin.mNode;
00561   PRInt32 firstOffsetInNode = 0;
00562   PRInt32 checkBeforeOffset = mSoftBegin.mOffset;
00563   while (node) {
00564     if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode))
00565       break;
00566     checkBeforeOffset = PR_INT32_MAX;
00567     if (IsBreakElement(mCSSView, node)) {
00568       // Since FindPrevNode follows tree *preorder*, we're about to traverse
00569       // up out of 'node'. Since node induces breaks (e.g., it's a block),
00570       // don't bother trying to look outside it, just stop now.
00571       break;
00572     }
00573     node = FindPrevNode(node, mRootNode);
00574   }
00575 
00576   // Now build up the string moving forward through the DOM until we reach
00577   // the soft end and *then* see a DOM word separator, a non-inline-element
00578   // boundary, or the hard end node.
00579   mSoftText.Truncate();
00580   mSoftTextDOMMapping.Clear();
00581   PRBool seenSoftEnd = PR_FALSE;
00582   // Leave this outside the loop so large heap string allocations can be reused
00583   // across iterations
00584   nsAutoString str;
00585   while (node) {
00586     if (node == mSoftEnd.mNode) {
00587       seenSoftEnd = PR_TRUE;
00588     }
00589 
00590     PRBool exit = PR_FALSE;
00591     if (IsTextNode(node)) {
00592       GetNodeText(node, str);
00593       PRInt32 lastOffsetInNode = str.Length();
00594 
00595       if (seenSoftEnd) {
00596         // check whether we can stop after this
00597         for (PRInt32 i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
00598              i < PRInt32(str.Length()); ++i) {
00599           if (IsDOMWordSeparator(str.CharAt(i))) {
00600             exit = PR_TRUE;
00601             // stop at the first separator after the soft end point
00602             lastOffsetInNode = i;
00603             break;
00604           }
00605         }
00606       }
00607       
00608       if (firstOffsetInNode < lastOffsetInNode) {
00609         PRInt32 len = lastOffsetInNode - firstOffsetInNode;
00610         mSoftTextDOMMapping.AppendElement(
00611           DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
00612         mSoftText.Append(Substring(str, firstOffsetInNode, len));
00613       }
00614       
00615       firstOffsetInNode = 0;
00616     }
00617 
00618     if (exit)
00619       break;
00620 
00621     CheckLeavingBreakElementClosure closure = { mCSSView, PR_FALSE };
00622     node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
00623     if (closure.mLeftBreakElement || (node && IsBreakElement(mCSSView, node))) {
00624       // We left, or are entering, a break element (e.g., block). Maybe we can
00625       // stop now.
00626       if (seenSoftEnd)
00627         break;
00628       // Record the break
00629       mSoftText.Append(' ');
00630     }
00631   }
00632   
00633 #ifdef DEBUG_SPELLCHECK
00634   printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
00635 #endif
00636 }
00637 
00638 void
00639 mozInlineSpellWordUtil::BuildRealWords()
00640 {
00641   // This is pretty simple. We just have to walk mSoftText, tokenizing it
00642   // into "real words".
00643   // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
00644   // SplitDOMWord on each of those DOM words
00645   PRInt32 wordStart = -1;
00646   mRealWords.Clear();
00647   for (PRInt32 i = 0; i < PRInt32(mSoftText.Length()); ++i) {
00648     if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
00649       if (wordStart >= 0) {
00650         SplitDOMWord(wordStart, i);
00651         wordStart = -1;
00652       }
00653     } else {
00654       if (wordStart < 0) {
00655         wordStart = i;
00656       }
00657     }
00658   }
00659   if (wordStart >= 0) {
00660     SplitDOMWord(wordStart, mSoftText.Length());
00661   }
00662 }
00663 
00664 /*********** DOM/realwords<->mSoftText mapping functions ************/
00665 
00666 PRInt32
00667 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
00668 {
00669   if (!mSoftTextValid) {
00670     NS_ERROR("Soft text must be valid if we're to map into it");
00671     return -1;
00672   }
00673   
00674   for (PRInt32 i = 0; i < PRInt32(mSoftTextDOMMapping.Length()); ++i) {
00675     const DOMTextMapping& map = mSoftTextDOMMapping[i];
00676     if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
00677       // Allow offsets at either end of the string, in particular, allow the
00678       // offset that's at the end of the contributed string
00679       PRInt32 offsetInContributedString =
00680         aNodeOffset.mOffset - map.mNodeOffset.mOffset;
00681       if (offsetInContributedString >= 0 &&
00682           offsetInContributedString <= map.mLength)
00683         return map.mSoftTextOffset + offsetInContributedString;
00684       return -1;
00685     }
00686   }
00687   return -1;
00688 }
00689 
00690 mozInlineSpellWordUtil::NodeOffset
00691 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(PRInt32 aSoftTextOffset,
00692                                                        DOMMapHint aHint)
00693 {
00694   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
00695   if (!mSoftTextValid)
00696     return NodeOffset(nsnull, -1);
00697   
00698   // The invariant is that the range start..end includes the last mapping,
00699   // if any, such that mSoftTextOffset <= aSoftTextOffset
00700   PRInt32 start = 0;
00701   PRInt32 end = mSoftTextDOMMapping.Length();
00702   while (end - start >= 2) {
00703     PRInt32 mid = (start + end)/2;
00704     const DOMTextMapping& map = mSoftTextDOMMapping[mid];
00705     if (map.mSoftTextOffset > aSoftTextOffset) {
00706       end = mid;
00707     } else {
00708       start = mid;
00709     }
00710   }
00711   
00712   if (start >= end)
00713     return NodeOffset(nsnull, -1);
00714 
00715   // 'start' is now the last mapping, if any, such that
00716   // mSoftTextOffset <= aSoftTextOffset.
00717   // If we're doing HINT_END, then we may want to return the end of the
00718   // the previous mapping instead of the start of this mapping
00719   if (aHint == HINT_END && start > 0) {
00720     const DOMTextMapping& map = mSoftTextDOMMapping[start - 1];
00721     if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
00722       return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);
00723   }
00724   
00725   // We allow ourselves to return the end of this mapping even if we're
00726   // doing HINT_START. This will only happen if there is no mapping which this
00727   // point is the start of. I'm not 100% sure this is OK...
00728   const DOMTextMapping& map = mSoftTextDOMMapping[start];
00729   PRInt32 offset = aSoftTextOffset - map.mSoftTextOffset;
00730   if (offset >= 0 && offset <= map.mLength)
00731     return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
00732     
00733   return NodeOffset(nsnull, -1);
00734 }
00735 
00736 PRInt32
00737 mozInlineSpellWordUtil::FindRealWordContaining(PRInt32 aSoftTextOffset,
00738     DOMMapHint aHint, PRBool aSearchForward)
00739 {
00740   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
00741   if (!mSoftTextValid)
00742     return -1;
00743 
00744   // The invariant is that the range start..end includes the last word,
00745   // if any, such that mSoftTextOffset <= aSoftTextOffset
00746   PRInt32 start = 0;
00747   PRInt32 end = mRealWords.Length();
00748   while (end - start >= 2) {
00749     PRInt32 mid = (start + end)/2;
00750     const RealWord& word = mRealWords[mid];
00751     if (word.mSoftTextOffset > aSoftTextOffset) {
00752       end = mid;
00753     } else {
00754       start = mid;
00755     }
00756   }
00757   
00758   if (start >= end)
00759     return -1;
00760 
00761   // 'start' is now the last word, if any, such that
00762   // mSoftTextOffset <= aSoftTextOffset.
00763   // If we're doing HINT_END, then we may want to return the end of the
00764   // the previous word instead of the start of this word
00765   if (aHint == HINT_END && start > 0) {
00766     const RealWord& word = mRealWords[start - 1];
00767     if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
00768       return start - 1;
00769   }
00770   
00771   // We allow ourselves to return the end of this word even if we're
00772   // doing HINT_START. This will only happen if there is no word which this
00773   // point is the start of. I'm not 100% sure this is OK...
00774   const RealWord& word = mRealWords[start];
00775   PRInt32 offset = aSoftTextOffset - word.mSoftTextOffset;
00776   if (offset >= 0 && offset <= word.mLength)
00777     return start;
00778 
00779   if (aSearchForward) {
00780     if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
00781       // All words have mSoftTextOffset > aSoftTextOffset
00782       return 0;
00783     }
00784     // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.
00785     // Word start+1, if it exists, will be the first with
00786     // mSoftTextOffset > aSoftTextOffset.
00787     if (start + 1 < PRInt32(mRealWords.Length()))
00788       return start + 1;
00789   }
00790 
00791   return -1;
00792 }
00793 
00794 /*********** Word Splitting ************/
00795 
// Classification of a character (or of the current scan position) inside a
// DOM word
enum CharClass {
  CHAR_CLASS_WORD,          // belongs to a word
  CHAR_CLASS_SEPARATOR,     // separates words
  CHAR_CLASS_END_OF_INPUT   // the scan position is past the last character
};
00801 
00802 // Encapsulates DOM-word to real-word splitting
00803 struct WordSplitState
00804 {
00805   mozInlineSpellWordUtil*    mWordUtil;
00806   const nsDependentSubstring mDOMWordText;
00807   PRInt32                    mDOMWordOffset;
00808   CharClass                  mCurCharClass;
00809 
00810   WordSplitState(mozInlineSpellWordUtil* aWordUtil,
00811                  const nsString& aString, PRInt32 aStart, PRInt32 aLen)
00812     : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
00813       mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
00814 
00815   CharClass ClassifyCharacter(PRInt32 aIndex, PRBool aRecurse) const;
00816   void Advance();
00817   void AdvanceThroughSeparators();
00818   void AdvanceThroughWord();
00819 
00820   // Finds special words like email addresses and URLs that may start at the
00821   // current position, and returns their length, or 0 if not found. This allows
00822   // arbitrary word breaking rules to be used for these special entities, as
00823   // long as they can not contain whitespace.
00824   PRInt32 FindSpecialWord();
00825 
00826   // Similar to FindSpecialWord except that this takes a split word as
00827   // input. This checks for things that do not require special word-breaking
00828   // rules.
00829   PRBool ShouldSkipWord(PRInt32 aStart, PRInt32 aLength);
00830 };
00831 
00832 // WordSplitState::ClassifyCharacter
00833 
00834 CharClass
00835 WordSplitState::ClassifyCharacter(PRInt32 aIndex, PRBool aRecurse) const
00836 {
00837   NS_ASSERTION(aIndex >= 0 && aIndex <= PRInt32(mDOMWordText.Length()),
00838                "Index out of range");
00839   if (aIndex == PRInt32(mDOMWordText.Length()))
00840     return CHAR_CLASS_SEPARATOR;
00841 
00842   // this will classify the character, we want to treat "ignorable" characters
00843   // such as soft hyphens as word characters.
00844   PRInt32 charCategory = GetCat(mDOMWordText[aIndex]);
00845   if (charCategory == CHAR_CAT_WORD ||
00846       IsIgnorableCharacter(mDOMWordText[aIndex]))
00847     return CHAR_CLASS_WORD;
00848 
00849   // If conditional punctuation is surrounded immediately on both sides by word
00850   // characters it also counts as a word character.
00851   if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
00852     if (!aRecurse) {
00853       // not allowed to look around, this punctuation counts like a separator
00854       return CHAR_CLASS_SEPARATOR;
00855     }
00856 
00857     // check the left-hand character
00858     if (aIndex == 0)
00859       return CHAR_CLASS_SEPARATOR;
00860     if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
00861       return CHAR_CLASS_SEPARATOR;
00862 
00863     // now we know left char is a word-char, check the right-hand character
00864     if (aIndex == PRInt32(mDOMWordText.Length()) - 1)
00865       return CHAR_CLASS_SEPARATOR;
00866     if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
00867       return CHAR_CLASS_SEPARATOR;
00868 
00869     // char on either side is a word, this counts as a word
00870     return CHAR_CLASS_WORD;
00871   }
00872 
00873   // all other punctuation
00874   if (charCategory == CHAR_CAT_SPACE ||
00875       charCategory == CHAR_CAT_CONTROL ||
00876       charCategory == CHAR_CAT_PUNCTUATION1 ||
00877       charCategory == CHAR_CAT_PUNCTUATION2)
00878     return CHAR_CLASS_SEPARATOR;
00879 
00880   // any other character counts as a word
00881   return CHAR_CLASS_WORD;
00882 }
00883 
00884 
00885 // WordSplitState::Advance
00886 
00887 void
00888 WordSplitState::Advance()
00889 {
00890   NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
00891   NS_ASSERTION(mDOMWordOffset < (PRInt32)mDOMWordText.Length(),
00892                "Length beyond end");
00893 
00894   mDOMWordOffset ++;
00895   if (mDOMWordOffset >= (PRInt32)mDOMWordText.Length())
00896     mCurCharClass = CHAR_CLASS_END_OF_INPUT;
00897   else
00898     mCurCharClass = ClassifyCharacter(mDOMWordOffset, PR_TRUE);
00899 }
00900 
00901 
00902 // WordSplitState::AdvanceThroughSeparators
00903 
00904 void
00905 WordSplitState::AdvanceThroughSeparators()
00906 {
00907   while (mCurCharClass == CHAR_CLASS_SEPARATOR)
00908     Advance();
00909 }
00910 
00911 // WordSplitState::AdvanceThroughWord
00912 
00913 void
00914 WordSplitState::AdvanceThroughWord()
00915 {
00916   while (mCurCharClass == CHAR_CLASS_WORD)
00917     Advance();
00918 }
00919 
00920 
00921 // WordSplitState::FindSpecialWord
00922 
00923 PRInt32
00924 WordSplitState::FindSpecialWord()
00925 {
00926   PRInt32 i;
00927 
00928   // Search for email addresses. We simply define these as any sequence of
00929   // characters with an '@' character in the middle. The DOM word is already
00930   // split on whitepace, so we know that everything to the end is the address
00931   //
00932   // Also look for periods, this tells us if we want to run the URL finder.
00933   PRBool foundDot = PR_FALSE;
00934   PRInt32 firstColon = -1;
00935   for (i = mDOMWordOffset;
00936        i < PRInt32(mDOMWordText.Length()); i ++) {
00937     if (mDOMWordText[i] == '@') {
00938       // only accept this if there are unambigous word characters (don't bother
00939       // recursing to disambiguate apostrophes) on each side. This prevents
00940       // classifying, e.g. "@home" as an email address
00941 
00942       // Use this condition to only accept words with '@' in the middle of
00943       // them. It works, but the inlinespellcker doesn't like this. The problem
00944       // is that you type "fhsgfh@" that's a misspelled word followed by a
00945       // symbol, but when you type another letter "fhsgfh@g" that first word
00946       // need to be unmarked misspelled. It doesn't do this. it only checks the
00947       // current position for potentially removing a spelling range.
00948       if (i > 0 && ClassifyCharacter(i - 1, PR_FALSE) == CHAR_CLASS_WORD &&
00949           i < (PRInt32)mDOMWordText.Length() - 1 &&
00950           ClassifyCharacter(i + 1, PR_FALSE) == CHAR_CLASS_WORD)
00951 
00952       return mDOMWordText.Length() - mDOMWordOffset;
00953     } else if (mDOMWordText[i] == '.' && ! foundDot &&
00954         i > 0 && i < (PRInt32)mDOMWordText.Length() - 1) {
00955       // we found a period not at the end, we should check harder for URLs
00956       foundDot = PR_TRUE;
00957     } else if (mDOMWordText[i] == ':' && firstColon < 0) {
00958       firstColon = i;
00959     }
00960   }
00961 
00962   // If the first colon is followed by a slash, consider it a URL
00963   // This will catch things like asdf://foo.com
00964   if (firstColon >= 0 && firstColon < (PRInt32)mDOMWordText.Length() - 1 &&
00965       mDOMWordText[firstColon + 1] == '/') {
00966     return mDOMWordText.Length() - mDOMWordOffset;
00967   }
00968 
00969   // Check the text before the first colon against some known protocols. It
00970   // is impossible to check against all protocols, especially since you can
00971   // plug in new protocols. We also don't want to waste time here checking
00972   // against a lot of obscure protocols.
00973   if (firstColon > mDOMWordOffset) {
00974     nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
00975                       firstColon - mDOMWordOffset));
00976     if (protocol.EqualsIgnoreCase("http") ||
00977         protocol.EqualsIgnoreCase("https") ||
00978         protocol.EqualsIgnoreCase("news") ||
00979         protocol.EqualsIgnoreCase("ftp") ||
00980         protocol.EqualsIgnoreCase("file") ||
00981         protocol.EqualsIgnoreCase("javascript") ||
00982         protocol.EqualsIgnoreCase("ftp")) {
00983       return mDOMWordText.Length() - mDOMWordOffset;
00984     }
00985   }
00986 
00987   // not anything special
00988   return -1;
00989 }
00990 
00991 // WordSplitState::ShouldSkipWord
00992 
00993 PRBool
00994 WordSplitState::ShouldSkipWord(PRInt32 aStart, PRInt32 aLength)
00995 {
00996   PRInt32 last = aStart + aLength;
00997 
00998   // check to see if the word contains a digit
00999   for (PRInt32 i = aStart; i < last; i ++) {
01000     PRUnichar ch = mDOMWordText[i];
01001     // XXX Shouldn't this be something a lot more complex, Unicode-based?
01002     if (ch >= '0' && ch <= '9')
01003       return PR_TRUE;
01004   }
01005 
01006   // not special
01007   return PR_FALSE;
01008 }
01009 
01010 // mozInlineSpellWordUtil::SplitDOMWord
01011 
01012 void
01013 mozInlineSpellWordUtil::SplitDOMWord(PRInt32 aStart, PRInt32 aEnd)
01014 {
01015   WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
01016   state.mCurCharClass = state.ClassifyCharacter(0, PR_TRUE);
01017 
01018   while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
01019     state.AdvanceThroughSeparators();
01020     if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT)
01021       break;
01022 
01023     PRInt32 specialWordLength = state.FindSpecialWord();
01024     if (specialWordLength > 0) {
01025       mRealWords.AppendElement(
01026         RealWord(aStart + state.mDOMWordOffset, specialWordLength, PR_FALSE));
01027 
01028       // skip the special word
01029       state.mDOMWordOffset += specialWordLength;
01030       if (state.mDOMWordOffset + aStart >= aEnd)
01031         state.mCurCharClass = CHAR_CLASS_END_OF_INPUT;
01032       else
01033         state.mCurCharClass = state.ClassifyCharacter(state.mDOMWordOffset, PR_TRUE);
01034       continue;
01035     }
01036 
01037     // save the beginning of the word
01038     PRInt32 wordOffset = state.mDOMWordOffset;
01039 
01040     // find the end of the word
01041     state.AdvanceThroughWord();
01042     PRInt32 wordLen = state.mDOMWordOffset - wordOffset;
01043     mRealWords.AppendElement(
01044       RealWord(aStart + wordOffset, wordLen,
01045                !state.ShouldSkipWord(wordOffset, wordLen)));
01046   }
01047 }