Back to index

lightning-sunbird  0.9+nobinonly
nsHTMLTokenizer.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* vim: set sw=2 ts=2 et tw=78: */
00003 /* ***** BEGIN LICENSE BLOCK *****
00004  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00005  *
00006  * The contents of this file are subject to the Mozilla Public License Version
00007  * 1.1 (the "License"); you may not use this file except in compliance with
00008  * the License. You may obtain a copy of the License at
00009  * http://www.mozilla.org/MPL/
00010  *
00011  * Software distributed under the License is distributed on an "AS IS" basis,
00012  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00013  * for the specific language governing rights and limitations under the
00014  * License.
00015  *
00016  * The Original Code is mozilla.org code.
00017  *
00018  * The Initial Developer of the Original Code is
00019  * Netscape Communications Corporation.
00020  * Portions created by the Initial Developer are Copyright (C) 1998
00021  * the Initial Developer. All Rights Reserved.
00022  *
00023  * Contributor(s):
00024  *   Blake Kaplan <mrbkap@gmail.com>
00025  *
00026  * Alternatively, the contents of this file may be used under the terms of
00027  * either of the GNU General Public License Version 2 or later (the "GPL"),
00028  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00029  * in which case the provisions of the GPL or the LGPL are applicable instead
00030  * of those above. If you wish to allow use of your version of this file only
00031  * under the terms of either the GPL or the LGPL, and not to allow others to
00032  * use your version of this file under the terms of the MPL, indicate your
00033  * decision by deleting the provisions above and replace them with the notice
00034  * and other provisions required by the GPL or the LGPL. If you do not delete
00035  * the provisions above, a recipient may use your version of this file under
00036  * the terms of any one of the MPL, the GPL or the LGPL.
00037  *
00038  * ***** END LICENSE BLOCK ***** */
00039 
00040 
00050 #include "nsIAtom.h"
00051 #include "nsHTMLTokenizer.h"
00052 #include "nsScanner.h"
00053 #include "nsElementTable.h"
00054 #include "CParserContext.h"
00055 #include "nsReadableUtils.h"
00056 #include "nsUnicharUtils.h"
00057 
00058 /************************************************************************
00059   And now for the main class -- nsHTMLTokenizer...
00060  ************************************************************************/
00061 
// File-local interface IDs. nsHTMLTokenizer::QueryInterface compares the
// requested IID against each of these in turn.
00062 static NS_DEFINE_IID(kISupportsIID,   NS_ISUPPORTS_IID);                 
00063 static NS_DEFINE_IID(kITokenizerIID,  NS_ITOKENIZER_IID);
00064 static NS_DEFINE_IID(kClassIID,       NS_HTMLTOKENIZER_IID); 
00065 
00075 nsresult nsHTMLTokenizer::QueryInterface(const nsIID& aIID, void** aInstancePtr)  
00076 {                                                                        
00077   if (NULL == aInstancePtr) {                                            
00078     return NS_ERROR_NULL_POINTER;                                        
00079   }                                                                      
00080 
00081   if(aIID.Equals(kISupportsIID))    {  // Do IUnknown...
00082     *aInstancePtr = (nsISupports*)(this);                                        
00083   }
00084   else if(aIID.Equals(kITokenizerIID)) {  // Do ITokenizer base class...
00085     *aInstancePtr = (nsITokenizer*)(this);                                        
00086   }
00087   else if(aIID.Equals(kClassIID)) {  // Do this class...
00088     *aInstancePtr = (nsHTMLTokenizer*)(this);                                        
00089   }                 
00090   else {
00091     *aInstancePtr=0;
00092     return NS_NOINTERFACE;
00093   }
00094   NS_ADDREF_THIS();
00095   return NS_OK;                                                        
00096 }
00097 
00109 nsresult NS_NewHTMLTokenizer(nsITokenizer** aInstancePtrResult,
00110                              PRInt32 aFlag,
00111                              eParserDocType aDocType, 
00112                              eParserCommands aCommand,
00113                              PRInt32 aFlags) 
00114 {
00115   NS_PRECONDITION(nsnull != aInstancePtrResult, "null ptr");
00116   if (nsnull == aInstancePtrResult) {
00117     return NS_ERROR_NULL_POINTER;
00118   }
00119   nsHTMLTokenizer* it = new nsHTMLTokenizer(aFlag,aDocType,aCommand,aFlags);
00120   if (nsnull == it) {
00121     return NS_ERROR_OUT_OF_MEMORY;
00122   }
00123   return it->QueryInterface(kClassIID, (void **) aInstancePtrResult);
00124 }
00125 
00126 
// XPCOM macros that supply nsHTMLTokenizer's AddRef() and Release().
00127 NS_IMPL_ADDREF(nsHTMLTokenizer)
00128 NS_IMPL_RELEASE(nsHTMLTokenizer)
00129 
00130 
00138 nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode,
00139                                  eParserDocType aDocType,
00140                                  eParserCommands aCommand,
00141                                  PRUint16 aFlags) :
00142   nsITokenizer(), mTokenDeque(0), mFlags(aFlags)
00143 {
00144   if (aParseMode==eDTDMode_full_standards ||
00145       aParseMode==eDTDMode_almost_standards) {
00146     mFlags |= NS_IPARSER_FLAG_STRICT_MODE;
00147   }
00148   else if (aParseMode==eDTDMode_quirks)  {
00149     mFlags |= NS_IPARSER_FLAG_QUIRKS_MODE;
00150   }
00151   else if (aParseMode==eDTDMode_autodetect) {
00152     mFlags |= NS_IPARSER_FLAG_AUTO_DETECT_MODE;
00153   }
00154   else {
00155     mFlags |= NS_IPARSER_FLAG_UNKNOWN_MODE;
00156   }
00157 
00158   if (aDocType==ePlainText) {
00159     mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
00160   }
00161   else if (aDocType==eXML) {
00162     mFlags |= NS_IPARSER_FLAG_XML;
00163   }
00164   else if (aDocType==eHTML_Quirks ||
00165            aDocType==eHTML3_Quirks ||
00166            aDocType==eHTML_Strict) {
00167     mFlags |= NS_IPARSER_FLAG_HTML;
00168   }
00169   
00170   mFlags |= (aCommand==eViewSource)? NS_IPARSER_FLAG_VIEW_SOURCE:NS_IPARSER_FLAG_VIEW_NORMAL;
00171 
00172   NS_ASSERTION(!(mFlags & NS_IPARSER_FLAG_XML) || 
00173                 (mFlags & NS_IPARSER_FLAG_VIEW_SOURCE),
00174               "Why isn't this XML document going through our XML parser?");
00175 
00176   mTokenAllocator = nsnull;
00177   mTokenScanPos = 0;
00178 }
00179 
00180 
00184 nsHTMLTokenizer::~nsHTMLTokenizer()
00185 {
00186   if(mTokenDeque.GetSize()){
00187     CTokenDeallocator theDeallocator(mTokenAllocator->GetArenaPool());
00188     mTokenDeque.ForEach(theDeallocator);
00189   }
00190 }
00191  
00192 
00193 /*******************************************************************
00194   Here begins the real working methods for the tokenizer.
00195  *******************************************************************/
00196 
00208 /* static */
00209 void nsHTMLTokenizer::AddToken(CToken*& aToken,
00210                                nsresult aResult,
00211                                nsDeque* aDeque,
00212                                nsTokenAllocator* aTokenAllocator)
00213 {
00214   if(aToken && aDeque) {
00215     if(NS_SUCCEEDED(aResult)) {
00216       aDeque->Push(aToken);
00217     }
00218     else {
00219       IF_FREE(aToken, aTokenAllocator);
00220     }
00221   }
00222 }
00223 
00229 nsTokenAllocator* nsHTMLTokenizer::GetTokenAllocator(void)
00230 {
00231   return mTokenAllocator;
00232 }
00233 
00234 
00241 CToken* nsHTMLTokenizer::PeekToken()
00242 {
00243   return (CToken*)mTokenDeque.PeekFront();
00244 }
00245 
00246 
00253 CToken* nsHTMLTokenizer::PopToken()
00254 {
00255   CToken* result=nsnull;
00256   result=(CToken*)mTokenDeque.PopFront();
00257   return result;
00258 }
00259 
00260 
00268 CToken* nsHTMLTokenizer::PushTokenFront(CToken* theToken)
00269 {
00270   mTokenDeque.PushFront(theToken);
00271   return theToken;
00272 }
00273 
00280 CToken* nsHTMLTokenizer::PushToken(CToken* theToken)
00281 {
00282   mTokenDeque.Push(theToken);
00283   return theToken;
00284 }
00285 
00291 PRInt32 nsHTMLTokenizer::GetCount(void)
00292 {
00293   return mTokenDeque.GetSize();
00294 }
00295 
00304 CToken* nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex)
00305 {
00306   return (CToken*)mTokenDeque.ObjectAt(anIndex);
00307 }
00308 
00318 nsresult nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk,
00319                                        nsTokenAllocator* aTokenAllocator)
00320 {
00321   mTokenAllocator=aTokenAllocator;
00322   mIsFinalChunk=aIsFinalChunk;
00323   // Cause ScanDocStructure to search from here for new tokens...
00324   mTokenScanPos=mTokenDeque.GetSize();
00325   return NS_OK;
00326 }
00327 
00334 void nsHTMLTokenizer::PrependTokens(nsDeque& aDeque)
00335 {
00336   PRInt32 aCount=aDeque.GetSize();
00337   
00338   PRInt32 anIndex=0;
00339   for(anIndex=0;anIndex<aCount;++anIndex){
00340     CToken* theToken=(CToken*)aDeque.Pop();
00341     PushTokenFront(theToken);
00342   }
00343 
00344 }
00345 
00354 nsresult nsHTMLTokenizer::CopyState(nsITokenizer* aTokenizer)
00355 {
00356   if (aTokenizer) {
00357     mFlags = ((nsHTMLTokenizer*)aTokenizer)->mFlags;
00358   }
00359 
00360   return NS_OK;
00361 }
00362 
00372 static PRInt32 FindLastIndexOfTag(eHTMLTags aTag,nsDeque &aTagStack)
00373 {
00374   PRInt32 theCount=aTagStack.GetSize();
00375   
00376   while(0<theCount) {
00377     CHTMLToken *theToken=(CHTMLToken*)aTagStack.ObjectAt(--theCount);  
00378     if(theToken) {
00379       eHTMLTags  theTag=(eHTMLTags)theToken->GetTypeID();
00380       if(theTag==aTag) {
00381         return theCount;
00382       }
00383     }
00384   }
00385 
00386   return kNotFound;
00387 }
00388 
/**
 * Walks the queued tokens and records, on each container start token,
 * whether its matching end tag was found in the expected place
 * (eWellFormed) or not (eMalformed).
 *
 * The scan begins at mTokenScanPos, after first backing that position up to
 * the nearest earlier start token whose container info is still
 * eFormUnknown. It stops when ObjectAt() returns no token or when the
 * running nesting depth reaches theMaxStackDepth. mTokenScanPos is advanced
 * as the scan proceeds.
 *
 * Only container tags that are block-level, inline, or <table> take part.
 *
 * @param aFinalChunk  not read anywhere in this function body
 * @return always NS_OK (result is never reassigned)
 */
00398 nsresult nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk)
00399 {
00400   nsresult result = NS_OK;
00401   if (!mTokenDeque.GetSize())
00402     return result;
00403 
00404   CHTMLToken* theToken = (CHTMLToken*)mTokenDeque.ObjectAt(mTokenScanPos);
00405 
00406   // Start by finding the first start tag that hasn't been reviewed.
00407   while(mTokenScanPos > 0) {
00408     if(theToken) {
00409       eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());  
00410       if(eToken_start == theType) {
00411         if(eFormUnknown == theToken->GetContainerInfo()) {
00412           break;
00413         }
00414       }
00415     }
00416     theToken = (CHTMLToken*)mTokenDeque.ObjectAt(--mTokenScanPos);
00417   }
00418 
00419   // Now that we know where to start, let's walk through the
00420   // tokens to see which are well-formed. Stop when you run out
00421   // of fresh tokens.
00422 
        // theStack holds the start tokens that are currently open; tempStack
        // is scratch space used while unwinding to a mismatched end tag.
00423   nsDeque       theStack(0);
00424   nsDeque       tempStack(0);
00425   PRInt32       theStackDepth = 0;
00426   // Don't bother if we get ridiculously deep.
00427   static  const PRInt32 theMaxStackDepth = 200;
00428 
00429   while(theToken && theStackDepth < theMaxStackDepth) {
00430     eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
00431     eHTMLTags       theTag  = (eHTMLTags)theToken->GetTypeID();
00432 
00433     if(nsHTMLElement::IsContainer(theTag)) { // Bug 54117
00434       PRBool theTagIsBlock  = gHTMLElements[theTag].IsMemberOf(kBlockEntity);
00435       PRBool theTagIsInline = (theTagIsBlock) ?
00436                                 PR_FALSE :
00437                                 gHTMLElements[theTag].IsMemberOf(kInlineEntity);
00438 
00439       if(theTagIsBlock || theTagIsInline || eHTMLTag_table == theTag) {
00440         switch(theType) {
00441           case eToken_start:
00442             {
00443               if (gHTMLElements[theTag].ShouldVerifyHierarchy()) {
00444                 PRInt32 earlyPos = FindLastIndexOfTag(theTag, theStack);
00445                 if (earlyPos != kNotFound) {
00446                   // Uh-oh, we've found a tag that is not allowed to nest at
00447                   // all. Mark the previous one and all of its children as 
00448                   // malformed to increase our chances of doing RS handling
00449                   // on all of them. We want to do this for cases such as:
00450                   // <a><div><a></a></div></a>.
00451                   // Note that we have to iterate through all of the children
00452                   // of the original malformed tag to protect against:
00453                   // <a><font><div><a></a></div></font></a>, so that the <font>
00454                   // is allowed to contain the <div>.
00455                   // XXX What about <a><span><a>, where the second <a> closes
00456                   // the <span>?
00457                   nsDequeIterator it(theStack, earlyPos), end(theStack.End());
00458                   while (it < end) {
00459                     CHTMLToken *theMalformedToken = 
00460                         NS_STATIC_CAST(CHTMLToken*, it++);
00461                   
00462                     theMalformedToken->SetContainerInfo(eMalformed);
00463                   }
00464                 }
00465               }
00466 
00467               theStack.Push(theToken);
00468               ++theStackDepth;
00469             }
00470             break;
00471           case eToken_end: 
00472             {
00473               CHTMLToken *theLastToken = NS_STATIC_CAST(CHTMLToken*, theStack.Peek());
00474               if(theLastToken) {
00475                 if(theTag == theLastToken->GetTypeID()) {
00476                   theStack.Pop(); // Yank it for real 
00477                   theStackDepth--;
00478                   theLastToken->SetContainerInfo(eWellFormed);
00479                 }
00480                 else {
00481                   // This token wasn't what we expected it to be! We need to
00482                   // go searching for its real start tag on our stack. Each
00483                   // tag in between the end tag and start tag must be malformed
00484 
00485                   if(FindLastIndexOfTag(theTag, theStack) != kNotFound) {
00486                     // Find theTarget in the stack, marking each (malformed!)
00487                     // tag in our way.
00488                     theStack.Pop(); // pop off theLastToken for real.
00489                     do {
00490                       theLastToken->SetContainerInfo(eMalformed);
00491                       tempStack.Push(theLastToken);
00492                       theLastToken = NS_STATIC_CAST(CHTMLToken*, theStack.Pop());
00493                     } while(theLastToken && theTag != theLastToken->GetTypeID());
00494                     // XXX The above test can confuse two different userdefined 
00495                     // tags.
00496 
00497                     NS_ASSERTION(theLastToken,
00498                                  "FindLastIndexOfTag lied to us!"
00499                                  " We couldn't find theTag on theStack");
00500                     theLastToken->SetContainerInfo(eMalformed);
00501 
00502                     // Great, now push all of the other tokens back onto the
00503                     // stack to preserve the general structure of the document.
00504                     // Note that we don't push the target token back onto the
00505                     // the stack (since it was just closed).
00506                     while(tempStack.GetSize() != 0) {
00507                       theStack.Push(tempStack.Pop());
00508                     }
                          // NOTE(review): theStack is one entry shorter after
                          // this branch, but theStackDepth is not decremented
                          // here as it is in the matching-tag branch above --
                          // confirm whether that is intended.
00509                   } // else ignore a bogus end tag.
00510                 }
00511               } // if (theLastToken)
00512             }
00513             break;
00514           default:
00515             break; 
00516         }
00517       }
00518     }
00519 
00520     theToken = (CHTMLToken*)mTokenDeque.ObjectAt(++mTokenScanPos);
00521   }
00522 
00523   return result;
00524 }
00525 
00532 nsresult nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk)
00533 {
00534   return ScanDocStructure(aFinalChunk);
00535 }
00536 
00549 nsresult nsHTMLTokenizer::ConsumeToken(nsScanner& aScanner,PRBool& aFlushTokens)
00550 {
00551   PRUnichar theChar;
00552   CToken* theToken=0;
00553 
00554   nsresult result=aScanner.Peek(theChar);
00555 
00556   switch(result) {
00557     case kEOF:
00558       // Tell our caller that'we finished.
00559       return result;
00560 
00561     case NS_OK:
00562     default:
00563 
00564       if(!(mFlags & NS_IPARSER_FLAG_PLAIN_TEXT)) {
00565         if(kLessThan==theChar) {
00566           return ConsumeTag(theChar,theToken,aScanner,aFlushTokens);
00567         }
00568         else if(kAmpersand==theChar){
00569           return ConsumeEntity(theChar,theToken,aScanner);
00570         }
00571       }
00572       
00573       if((kCR==theChar) || (kLF==theChar)) {
00574         return ConsumeNewline(theChar,theToken,aScanner);
00575       }
00576       else {
00577         if(!nsCRT::IsAsciiSpace(theChar)) {
00578           if(theChar!=nsnull) { 
00579             result=ConsumeText(theToken,aScanner); 
00580           } 
00581           else { 
00582             aScanner.GetChar(theChar); // skip the embedded null char. Fix bug 64098. 
00583           } 
00584           break;
00585         }
00586         result=ConsumeWhitespace(theChar,theToken,aScanner);
00587       } 
00588       break; 
00589   }
00590 
00591   return result;
00592 }
00593 
00594 
00608 nsresult nsHTMLTokenizer::ConsumeTag(PRUnichar aChar,
00609                                      CToken*& aToken,
00610                                      nsScanner& aScanner,
00611                                      PRBool& aFlushTokens)
00612 {
00613   PRUnichar theNextChar, oldChar;
00614   nsresult result=aScanner.Peek(aChar,1);
00615 
00616   if(NS_OK==result) {
00617 
00618     switch(aChar) {
00619       case kForwardSlash:
00620         result=aScanner.Peek(theNextChar, 2);
00621 
00622         if(NS_OK==result) {
00623           // Get the original "<" (we've already seen it with a Peek)
00624           aScanner.GetChar(oldChar);
00625 
00626           // XML allows non ASCII tag names, consume this as an end tag. This
00627           // is needed to make XML view source work
00628           PRBool isXML=(mFlags & NS_IPARSER_FLAG_XML);
00629           if(nsCRT::IsAsciiAlpha(theNextChar)||(kGreaterThan==theNextChar)|| 
00630              (isXML && (! nsCRT::IsAscii(theNextChar)))) { 
00631             result=ConsumeEndTag(aChar,aToken,aScanner);
00632           }
00633           else result=ConsumeComment(aChar,aToken,aScanner);
00634         }
00635 
00636         break;
00637 
00638       case kExclamation:
00639         result=aScanner.Peek(theNextChar, 2);
00640 
00641         if(NS_OK==result) {
00642           // Get the original "<" (we've already seen it with a Peek)
00643           aScanner.GetChar(oldChar);
00644 
00645           if((kMinus==theNextChar) || (kGreaterThan==theNextChar)) {
00646             result=ConsumeComment(aChar,aToken,aScanner);
00647           }
00648           else
00649             result=ConsumeSpecialMarkup(aChar,aToken,aScanner); 
00650         }
00651         break;
00652 
00653       case kQuestionMark: // It must be an XML processing instruction...
00654         // Get the original "<" (we've already seen it with a Peek)
00655         aScanner.GetChar(oldChar);
00656         result=ConsumeProcessingInstruction(aChar,aToken,aScanner);
00657         break;
00658 
00659       default:
00660         // XML allows non ASCII tag names, consume this as a start tag.
00661         PRBool isXML=(mFlags & NS_IPARSER_FLAG_XML);
00662         if(nsCRT::IsAsciiAlpha(aChar) ||
00663             (isXML && (! nsCRT::IsAscii(aChar)))) { 
00664           // Get the original "<" (we've already seen it with a Peek)
00665           aScanner.GetChar(oldChar);
00666           result=ConsumeStartTag(aChar,aToken,aScanner,aFlushTokens);
00667         }
00668         else {
00669           // We are not dealing with a tag. So, don't consume the original
00670           // char and leave the decision to ConsumeText().
00671           result=ConsumeText(aToken,aScanner);
00672         }
00673     }
00674   }
00675  
00676   // Last ditch attempt to make sure we don't lose data.
00677   if (kEOF == result && !aScanner.IsIncremental()) {
00678     // Whoops, we don't want to lose any data! Consume the rest as text.
00679     // This normally happens for either a trailing < or </
00680     result = ConsumeText(aToken,aScanner);
00681   }
00682 
00683   return result;
00684 }
00685 
/**
 * Consumes the attributes of a tag, queuing one attribute token per usable
 * attribute, until the closing '>' is consumed, a '<' is seen, or an error
 * occurs. The number of queued attributes is stored on aToken with
 * SetAttributeCount().
 *
 * @param aChar     passed to each attribute token's Consume(); overwritten
 *                  by the peeks in the loop
 * @param aToken    the tag token that owns the attributes; marked in-error
 *                  when a '<' is encountered or the loop ends with a failure
 * @param aScanner  source of characters
 * @return NS_OK on success. A failure is returned as-is for an incremental
 *         scanner and replaced by NS_OK otherwise.
 */
00695 nsresult nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar,
00696                                             CToken* aToken,
00697                                             nsScanner& aScanner)
00698 {
00699   PRBool done=PR_FALSE;
00700   nsresult result=NS_OK;
00701   PRInt16 theAttrCount=0;
00702 
00703   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
00704 
00705   while((!done) && (result==NS_OK)) {
00706     CAttributeToken* theToken =
00707       NS_STATIC_CAST(CAttributeToken*,
00708                      theAllocator->CreateTokenOfType(eToken_attribute,
00709                                                      eHTMLTag_unknown));
          // If no token could be created, this block is skipped and the loop
          // goes straight to the peek below.
00710     if(theToken){
00711       // Tell the new token to finish consuming text...
00712       result=theToken->Consume(aChar,aScanner,mFlags);
00713  
00714       // Much as I hate to do this, here's some special case code.
00715       // This handles the case of empty-tags in XML. Our last
00716       // attribute token will come through with a text value of ""
00717       // and a textkey of "/". We should destroy it.
00718       if(NS_SUCCEEDED(result)) {
00719         PRBool isUsableAttr = PR_TRUE;
00720         const nsSubstring& key=theToken->GetKey();
00721         const nsAString& text=theToken->GetValue();
00722 
00723         if(!key.IsEmpty() && kForwardSlash==key.First() && text.IsEmpty()) {
00724           if(!(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE)) {
00725             // We only care about these in view-source.
00726             isUsableAttr = PR_FALSE;
00727           }
00728         }
00729         if(isUsableAttr) {
00730           ++theAttrCount;
00731           AddToken((CToken*&)theToken,result,&mTokenDeque,theAllocator);
00732         }
00733         else {
00734           IF_FREE(theToken, mTokenAllocator);
00735         }
00736       }
00737       else {
00738         IF_FREE(theToken, mTokenAllocator);
00739         // Bad attributes are not a reason to set empty.
00740         if(NS_ERROR_HTMLPARSER_BADATTRIBUTE==result) {
00741           result=NS_OK;
00742         } else {
00743           aToken->SetEmpty(PR_TRUE);
00744         }
00745       }
00746     }
00747     
          // NOTE(review): this SkipWhitespace() call exists only in DEBUG
          // builds, yet it advances the scanner and overwrites result, so
          // debug and release builds do not execute the same steps here --
          // confirm this is intended.
00748 #ifdef DEBUG
00749     if(NS_SUCCEEDED(result)){
00750       PRInt32 newline = 0;
00751       result = aScanner.SkipWhitespace(newline);
00752       NS_ASSERTION(newline == 0, "CAttribute::Consume() failed to collect all the newlines!");
00753     }
00754 #endif
00755     if (NS_SUCCEEDED(result)) {
00756       result = aScanner.Peek(aChar);
00757       if (NS_SUCCEEDED(result)) {
00758         if (aChar == kGreaterThan) { // You just ate the '>'
00759           aScanner.GetChar(aChar); // Skip the '>'
00760           done = PR_TRUE;
00761         }
              // A '<' before the closing '>' ends the attribute list and
              // flags the tag as in-error; the '<' itself is not consumed.
00762         else if(aChar == kLessThan) {
00763           aToken->SetInError(PR_TRUE);
00764           done = PR_TRUE;
00765         }
00766       }
00767     }
00768   } // End while
00769 
00770   if (NS_FAILED(result)) {
00771     aToken->SetInError(PR_TRUE);
00772 
          // With a non-incremental scanner the failure is masked and the tag
          // is kept, marked in-error.
00773     if (!aScanner.IsIncremental()) {
00774       result = NS_OK;
00775     }
00776   }
00777 
00778   aToken->SetAttributeCount(theAttrCount);
00779   return result;
00780 }
00781 
/**
 * Consumes a start tag and queues the tokens it produces. ConsumeTag() has
 * already read the '<' before calling this.
 *
 * Steps:
 *  1. Create a start token, let it consume the tag name, and queue it.
 *  2. Peek at the next character: consume a '>' directly, otherwise hand
 *     off to ConsumeAttributes().
 *  3. Unless the document is XML, for tags whose content is treated as
 *     CDATA or PCDATA, consume that content into one text token and then
 *     consume (or fake) the matching end tag. <plaintext> instead switches
 *     the tokenizer into plain-text mode by setting
 *     NS_IPARSER_FLAG_PLAIN_TEXT.
 *  4. If anything failed, pop and release every token queued by this call.
 *
 * @param aChar         passed to the start token's Consume(); overwritten by
 *                      the peek that follows
 * @param aToken        out: the start token created here
 * @param aScanner      source of characters
 * @param aFlushTokens  assigned only on the CDATA path, where it becomes
 *                      true when the content ended at its end tag and the
 *                      tag is <script>
 * @return NS_OK on success, otherwise the failure that triggered unwinding.
 */
00793 nsresult nsHTMLTokenizer::ConsumeStartTag(PRUnichar aChar,
00794                                           CToken*& aToken,
00795                                           nsScanner& aScanner,
00796                                           PRBool& aFlushTokens)
00797 {
00798   // Remember this for later in case you have to unwind...
00799   PRInt32 theDequeSize=mTokenDeque.GetSize();
00800   nsresult result=NS_OK;
00801 
00802   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
00803   aToken=theAllocator->CreateTokenOfType(eToken_start,eHTMLTag_unknown);
00804   
00805   if(aToken) {
00806     // Tell the new token to finish consuming text...
00807     result= aToken->Consume(aChar,aScanner,mFlags);
00808 
00809     if(NS_SUCCEEDED(result)) {
00810       AddToken(aToken,result,&mTokenDeque,theAllocator);
00811 
00812       eHTMLTags theTag=(eHTMLTags)aToken->GetTypeID();
00813 
00814       // Good. Now, let's see if the next char is ">".
00815       // If so, we have a complete tag, otherwise, we have attributes.
00816       result = aScanner.Peek(aChar);
00817       if (NS_FAILED(result)) {
00818         aToken->SetInError(PR_TRUE);
00819 
00820         // Don't return early here so we can create a text and end token for
00821         // the special <iframe>, <script> and similar tags down below.
00822         result = NS_OK;
00823       }
00824       else {
00825         if(kGreaterThan != aChar) { // Look for a '>'
00826           result = ConsumeAttributes(aChar, aToken, aScanner);
00827         }
00828         else {
00829           aScanner.GetChar(aChar);
00830         }
00831       }
00832 
00833       /*  Now that that's over with, we have one more problem to solve.
00834           In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
00835           consume all the content itself.
00836           But XML doesn't treat these tags differently, so we shouldn't if the
00837           document is XML.
00838        */
00839       if(NS_SUCCEEDED(result) && !(mFlags & NS_IPARSER_FLAG_XML)) {
00840         PRBool isCDATA = gHTMLElements[theTag].CanContainType(kCDATA);
00841         PRBool isPCDATA = eHTMLTag_textarea == theTag ||
00842                           eHTMLTag_title    == theTag;
00843 
00844         // XXX This is an evil hack, we should be able to handle these properly
00845         // in the DTD.
00846         if ((eHTMLTag_iframe == theTag   && (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
00847             (eHTMLTag_noframes == theTag && (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
00848             (eHTMLTag_noscript == theTag && (mFlags & NS_IPARSER_FLAG_SCRIPT_ENABLED)) ||
00849             (eHTMLTag_noembed == theTag)) {
00850           isCDATA = PR_TRUE;
00851         }
00852 
00853         // Plaintext contains CDATA, but it's special, so we handle it
00854         // differently than the other CDATA elements
00855         if (eHTMLTag_plaintext == theTag) {
00856           isCDATA = PR_FALSE;
00857 
00858           // Note: We check in ConsumeToken() for this flag, and if we see it
00859           // we only construct text tokens (which is what we want).
00860           mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
00861         }
00862 
00863 
00864         if (isCDATA || isPCDATA) {
00865           PRBool done = PR_FALSE;
00866           nsDependentString endTagName(nsHTMLTags::GetStringValue(theTag)); 
00867 
                // NOTE(review): the text token is used below without a null
                // check, unlike aToken above -- confirm CreateTokenOfType()
                // cannot return null here.
00868           CToken* text =
00869               theAllocator->CreateTokenOfType(eToken_text,eHTMLTag_text);
00870           CTextToken* textToken = NS_STATIC_CAST(CTextToken*,text);
00871 
00872           if (isCDATA) {
00873             // The only tags that consume conservatively are <script> and
00874             // <style>, the rest all consume until the end of the document.
00875             result = textToken->ConsumeCharacterData(theTag==eHTMLTag_script ||
00876                                                      theTag==eHTMLTag_style,
00877                                                      theTag!=eHTMLTag_script,
00878                                                      aScanner,
00879                                                      endTagName,
00880                                                      mFlags,
00881                                                      done);
00882 
00883             // Only flush tokens for <script>, to give ourselves more of a
00884             // chance of allowing inlines to contain blocks.
00885             aFlushTokens = done && theTag == eHTMLTag_script;
00886           }
00887           else if (isPCDATA) {
00888             // Title is consumed conservatively in order to not regress
00889             // bug 42945
00890             result = textToken->ConsumeParsedCharacterData(
00891                                                         theTag==eHTMLTag_textarea,
00892                                                         theTag==eHTMLTag_title,
00893                                                         aScanner,
00894                                                         endTagName,
00895                                                         mFlags,
00896                                                         done);
00897 
00898             // Note: we *don't* set aFlushTokens here.
00899           }
00900 
00901           // We want to do this unless result is kEOF, in which case we will
00902           // simply unwind our stack and wait for more data anyway.
00903           if (kEOF != result) {
00904             AddToken(text,NS_OK,&mTokenDeque,theAllocator);
00905             CToken* endToken = nsnull;
00906             
00907             if (NS_SUCCEEDED(result) && done) {
00908               PRUnichar theChar;
00909               // Get the <
00910               result = aScanner.GetChar(theChar);
00911               NS_ASSERTION(NS_SUCCEEDED(result) && theChar == kLessThan,
00912                            "CTextToken::Consume*Data is broken!");
                    // The assignment to result inside this DEBUG block is
                    // overwritten by the ConsumeEndTag() call right after
                    // it, so it does not affect the returned value.
00913 #ifdef DEBUG
00914               // Ensure we have a /
00915               PRUnichar tempChar;  // Don't change non-debug vars in debug-only code
00916               result = aScanner.Peek(tempChar);
00917               NS_ASSERTION(NS_SUCCEEDED(result) && tempChar == kForwardSlash,
00918                            "CTextToken::Consume*Data is broken!");
00919 #endif
00920               result = ConsumeEndTag(PRUnichar('/'),endToken,aScanner);
00921             } else if (result == kFakeEndTag && 
00922                       !(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE)) {
                    // Outside view-source, synthesize the end tag that the
                    // content consumer asked us to fake.
00923               result = NS_OK;
00924               endToken=theAllocator->CreateTokenOfType(eToken_end,theTag,endTagName);
00925               AddToken(endToken,result,&mTokenDeque,theAllocator);
00926             } else if (result == kFakeEndTag) {
00927               // If we are here, we are both faking having seen the end tag
00928               // and are in view-source.
00929               result = NS_OK;
00930             }
00931           }
00932           else {
00933             IF_FREE(text, mTokenAllocator);
00934           }
00935         }
00936       }
00937 
00938       // This code is confusing, so pay attention.
00939       // If you're here, it's because we were in the midst of consuming a start
00940       // tag but ran out of data (not in the stream, but in this *part* of the
00941       // stream. For simplicity, we have to unwind our input. Therefore, we pop
00942       // and discard any new tokens we've cued this round. Later we can get 
00943       // smarter about this.
            // NOTE(review): the start token is among the tokens popped here,
            // but the caller's aToken is not reset afterwards -- confirm no
            // caller uses aToken after a failure.
00944       if(NS_FAILED(result)) {
00945         while(mTokenDeque.GetSize()>theDequeSize) {
00946           CToken* theToken=(CToken*)mTokenDeque.Pop();
00947           IF_FREE(theToken, mTokenAllocator);
00948         }
00949       }
00950     }
00951     else IF_FREE(aToken, mTokenAllocator);
00952   }
00953   return result;
00954 }
00955 
00964 nsresult nsHTMLTokenizer::ConsumeEndTag(PRUnichar aChar,
00965                                         CToken*& aToken,
00966                                         nsScanner& aScanner)
00967 { 
00968   // Get the "/" (we've already seen it with a Peek)
00969   aScanner.GetChar(aChar);
00970 
00971   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
00972   aToken=theAllocator->CreateTokenOfType(eToken_end,eHTMLTag_unknown);
00973   // Remember this for later in case you have to unwind...
00974   PRInt32 theDequeSize=mTokenDeque.GetSize();
00975   nsresult result=NS_OK;
00976   
00977   if(aToken) {
00978     // Tell the new token to finish consuming text...
00979     result= aToken->Consume(aChar,aScanner,mFlags);
00980     AddToken(aToken,result,&mTokenDeque,theAllocator);
00981     if (NS_FAILED(result)) {
00982       // Note that this early-return here is safe because we have not yet
00983       // added any of our tokens to the queue (AddToken only adds the token if
00984       // result is a success), so we don't need to fall through.
00985       return result;
00986     }
00987 
00988     result = aScanner.Peek(aChar);
00989     if (NS_FAILED(result)) {
00990       aToken->SetInError(PR_TRUE);
00991 
00992       // Note: We know here that the scanner is not incremental since if
00993       // this peek fails, then we've already masked over a kEOF coming from
00994       // the Consume() call above.
00995       return NS_OK;
00996     }
00997 
00998     if(kGreaterThan != aChar) {
00999       result = ConsumeAttributes(aChar, aToken, aScanner);
01000     }
01001     else {
01002       aScanner.GetChar(aChar);
01003     }        
01004 
01005     // Do the same thing as we do in ConsumeStartTag. Basically, if we've run
01006     // out of room in this *section* of the document, pop all of the tokens
01007     // we've consumed this round and wait for more data.
01008     if(NS_FAILED(result)) {
01009       while(mTokenDeque.GetSize()>theDequeSize) {
01010         CToken* theToken=(CToken*)mTokenDeque.Pop();
01011         IF_FREE(theToken, mTokenAllocator);
01012       }
01013     }
01014   }
01015   return result;
01016 }
01017 
01027 nsresult nsHTMLTokenizer::ConsumeEntity(PRUnichar aChar,
01028                                         CToken*& aToken,
01029                                         nsScanner& aScanner)
01030 {
01031   PRUnichar  theChar;
01032   nsresult result=aScanner.Peek(theChar, 1);
01033 
01034   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
01035   if (NS_SUCCEEDED(result)) {
01036     if (nsCRT::IsAsciiAlpha(theChar) || theChar==kHashsign) {
01037       aToken = theAllocator->CreateTokenOfType(eToken_entity,eHTMLTag_entity);
01038       result=aToken->Consume(theChar,aScanner,mFlags);
01039 
01040       if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
01041         IF_FREE(aToken, mTokenAllocator);
01042       }
01043       else {
01044         if (!aScanner.IsIncremental() && result == kEOF) {
01045           result=NS_OK; // Use as much of the entity as you can get.
01046         }
01047         AddToken(aToken,result,&mTokenDeque,theAllocator);
01048         return result;
01049       }
01050     }
01051     // Oops, we're actually looking at plain text...
01052     result = ConsumeText(aToken,aScanner);
01053   }
01054   else if (result == kEOF && !aScanner.IsIncremental()) {
01055     // If the last character in the file is an &, consume it as text.
01056     result = ConsumeText(aToken, aScanner);
01057     if (aToken) {
01058       aToken->SetInError(PR_TRUE);
01059     }
01060   }
01061   return result;
01062 }
01063 
01064 
01074 nsresult nsHTMLTokenizer::ConsumeWhitespace(PRUnichar aChar,
01075                                             CToken*& aToken,
01076                                             nsScanner& aScanner)
01077 {
01078   // Get the whitespace character
01079   aScanner.GetChar(aChar);
01080 
01081   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
01082   aToken = theAllocator->CreateTokenOfType(eToken_whitespace,eHTMLTag_whitespace);
01083   nsresult result=NS_OK;
01084   if(aToken) {
01085     result=aToken->Consume(aChar,aScanner,mFlags);
01086     AddToken(aToken,result,&mTokenDeque,theAllocator);
01087   }
01088   return result;
01089 }
01090 
01100 nsresult nsHTMLTokenizer::ConsumeComment(PRUnichar aChar,
01101                                          CToken*& aToken,
01102                                          nsScanner& aScanner)
01103 {
01104   // Get the "!"
01105   aScanner.GetChar(aChar);
01106 
01107   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
01108   aToken = theAllocator->CreateTokenOfType(eToken_comment,eHTMLTag_comment);
01109   nsresult result=NS_OK;
01110   if(aToken) {
01111     result=aToken->Consume(aChar,aScanner,mFlags);
01112     AddToken(aToken,result,&mTokenDeque,theAllocator);
01113   }
01114 
01115   if (kNotAComment == result) {
01116     // AddToken has IF_FREE()'d our token, so...
01117     result = ConsumeText(aToken, aScanner);
01118   }
01119 
01120   return result;
01121 }
01122 
01133 nsresult nsHTMLTokenizer::ConsumeText(CToken*& aToken,nsScanner& aScanner)
01134 {
01135   nsresult result=NS_OK;
01136   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
01137   CTextToken* theToken = (CTextToken*)theAllocator->CreateTokenOfType(eToken_text,eHTMLTag_text);
01138   if(theToken) {
01139     PRUnichar ch=0;
01140     result=theToken->Consume(ch,aScanner,mFlags);
01141     if(NS_FAILED(result)) {
01142       if(0==theToken->GetTextLength()){
01143         IF_FREE(aToken, mTokenAllocator);
01144         aToken = nsnull;
01145       }
01146       else result=NS_OK;
01147     }
01148     aToken = theToken;
01149     AddToken(aToken,result,&mTokenDeque,theAllocator);
01150   }
01151   return result;
01152 }
01153 
01163 nsresult nsHTMLTokenizer::ConsumeSpecialMarkup(PRUnichar aChar,
01164                                                CToken*& aToken,
01165                                                nsScanner& aScanner)
01166 {
01167   // Get the "!"
01168   aScanner.GetChar(aChar);
01169 
01170   nsresult result=NS_OK;
01171   nsAutoString theBufCopy;
01172   aScanner.Peek(theBufCopy, 20);
01173   ToUpperCase(theBufCopy);
01174   PRInt32 theIndex=theBufCopy.Find("DOCTYPE");
01175   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
01176   
01177   if(theIndex==kNotFound) {
01178     if('['==theBufCopy.CharAt(0)) {
01179       aToken = theAllocator->CreateTokenOfType(eToken_cdatasection,eHTMLTag_comment);  
01180     } else if (StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ELEMENT")) ||
01181                StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ATTLIST")) || 
01182                StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ENTITY")) || 
01183                StringBeginsWith(theBufCopy, NS_LITERAL_STRING("NOTATION"))) {
01184       aToken = theAllocator->CreateTokenOfType(eToken_markupDecl,eHTMLTag_markupDecl);
01185     } else {
01186       aToken = theAllocator->CreateTokenOfType(eToken_comment,eHTMLTag_comment);
01187     }
01188   }
01189   else
01190     aToken = theAllocator->CreateTokenOfType(eToken_doctypeDecl,eHTMLTag_doctypeDecl);
01191   
01192   if(aToken) {
01193     result=aToken->Consume(aChar,aScanner,mFlags);
01194     AddToken(aToken,result,&mTokenDeque,theAllocator);
01195   }
01196 
01197   if (result == kNotAComment) {
01198     result = ConsumeText(aToken, aScanner);
01199   }
01200 
01201   return result;
01202 }
01203 
01212 nsresult nsHTMLTokenizer::ConsumeNewline(PRUnichar aChar,
01213                                          CToken*& aToken,
01214                                          nsScanner& aScanner)
01215 {
01216   // Get the newline character
01217   aScanner.GetChar(aChar);
01218 
01219   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
01220   aToken=theAllocator->CreateTokenOfType(eToken_newline,eHTMLTag_newline);
01221   nsresult result=NS_OK;
01222   if(aToken) {
01223     result=aToken->Consume(aChar,aScanner,mFlags);
01224     AddToken(aToken,result,&mTokenDeque,theAllocator);
01225   }
01226   return result;
01227 }
01228 
01229 
01238 nsresult nsHTMLTokenizer::ConsumeProcessingInstruction(PRUnichar aChar,
01239                                                        CToken*& aToken,
01240                                                        nsScanner& aScanner)
01241 {
01242   // Get the "?"
01243   aScanner.GetChar(aChar);
01244 
01245   nsTokenAllocator* theAllocator=this->GetTokenAllocator();
01246   aToken=theAllocator->CreateTokenOfType(eToken_instruction,eHTMLTag_unknown);
01247   nsresult result=NS_OK;
01248   if(aToken) {
01249     result=aToken->Consume(aChar,aScanner,mFlags);
01250     AddToken(aToken,result,&mTokenDeque,theAllocator);
01251   }
01252   return result;
01253 }