Back to index

lightning-sunbird  0.9+nobinonly
nsHTMLTokens.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* vim: set ts=2 sw=2 et tw=78: */
00003 /* ***** BEGIN LICENSE BLOCK *****
00004  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00005  *
00006  * The contents of this file are subject to the Mozilla Public License Version
00007  * 1.1 (the "License"); you may not use this file except in compliance with
00008  * the License. You may obtain a copy of the License at
00009  * http://www.mozilla.org/MPL/
00010  *
00011  * Software distributed under the License is distributed on an "AS IS" basis,
00012  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00013  * for the specific language governing rights and limitations under the
00014  * License.
00015  *
00016  * The Original Code is mozilla.org code.
00017  *
00018  * The Initial Developer of the Original Code is
00019  * Netscape Communications Corporation.
00020  * Portions created by the Initial Developer are Copyright (C) 1998
00021  * the Initial Developer. All Rights Reserved.
00022  *
00023  * Contributor(s):
00024  *   Blake Kaplan <mrbkap@gmail.com>
00025  *
00026  * Alternatively, the contents of this file may be used under the terms of
00027  * either of the GNU General Public License Version 2 or later (the "GPL"),
00028  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00029  * in which case the provisions of the GPL or the LGPL are applicable instead
00030  * of those above. If you wish to allow use of your version of this file only
00031  * under the terms of either the GPL or the LGPL, and not to allow others to
00032  * use your version of this file under the terms of the MPL, indicate your
00033  * decision by deleting the provisions above and replace them with the notice
00034  * and other provisions required by the GPL or the LGPL. If you do not delete
00035  * the provisions above, a recipient may use your version of this file under
00036  * the terms of any one of the MPL, the GPL or the LGPL.
00037  *
00038  * ***** END LICENSE BLOCK ***** */
00039 
00040 #include <ctype.h> 
00041 #include <time.h>
00042 #include <stdio.h>  
00043 #include "nsScanner.h"
00044 #include "nsToken.h"
00045 #include "nsIAtom.h"
00046 #include "nsHTMLTokens.h"
00047 #include "prtypes.h"
00048 #include "nsDebug.h"
00049 #include "nsHTMLTags.h"
00050 #include "nsHTMLEntities.h"
00051 #include "nsCRT.h"
00052 #include "nsReadableUtils.h"
00053 #include "nsUnicharUtils.h"
00054 #include "nsScanner.h"
00055 
00056 
00057 static const PRUnichar sUserdefined[] = {'u', 's', 'e', 'r', 'd', 'e', 'f',
00058                                          'i', 'n', 'e', 'd', 0}; // zero-terminated PRUnichar spelling of "userdefined"
00059 // Zero-terminated character set: '&', tab, LF, CR, space and '>'. NOTE(review): the name suggests these end an attribute value; the use site is not in this part of the file.
00060 static const PRUnichar kAttributeTerminalChars[] = {
00061   PRUnichar('&'), PRUnichar('\t'), PRUnichar('\n'),
00062   PRUnichar('\r'), PRUnichar(' '), PRUnichar('>'),
00063   PRUnichar(0) // terminator
00064 };
00065 // Forward declaration: ConsumeEntity() below passes it the value parsed from a "&#..." reference; the definition is not in this part of the file.
00066 static void AppendNCR(nsSubstring& aString, PRInt32 aNCRValue);
00067 /* Reduces the entity at the scanner position into aString (ConsumeUntil()
00068  *  calls this while the scanner still sits on the '&').
00069  *  @param   aString -- receives the expansion, or the raw text if not reduced
00070  *  @param   aScanner -- controller of underlying input source
00071  *  @param   aFlag -- If NS_IPARSER_FLAG_VIEW_SOURCE do not reduce entities...
00072  *  @return  error result of the Peek or of CEntityToken::ConsumeEntity */
00073 static
00074 nsresult ConsumeEntity(nsScannerSharedSubstring& aString,
00075                        nsScanner& aScanner,
00076                        PRInt32 aFlag) 
00077 {
00078   nsresult result=NS_OK;
00079 
00080   PRUnichar ch;
00081   result=aScanner.Peek(ch, 1); // peek at offset 1 (the character after the current one); nothing is consumed yet
00082 
00083   if (NS_SUCCEEDED(result)) {
00084     PRUnichar amp=0;
00085     PRInt32 theNCRValue=0;
00086     nsAutoString entity;
00087 
00088     if (nsCRT::IsAsciiAlpha(ch) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) { // named entity, e.g. "&amp;"
00089       result=CEntityToken::ConsumeEntity(ch,entity,aScanner);
00090       if (NS_SUCCEEDED(result)) {
00091         theNCRValue = nsHTMLEntities::EntityToUnicode(entity);
00092         PRUnichar theTermChar=entity.Last();
00093         // If an entity value is greater than 255 then:
00094         // Nav 4.x does not treat it as an entity,
00095         // IE treats it as an entity if terminated with a semicolon.
00096         // Resembling IE!!
00097 
00098         nsSubstring &writable = aString.writable();
00099         if(theNCRValue < 0 || (theNCRValue > 255 && theTermChar != ';')) {
00100           // Looks like we're not dealing with an entity
00101           writable.Append(kAmpersand); // emit the text unreduced: '&' followed by what was consumed
00102           writable.Append(entity);
00103         }
00104         else {
00105           // A valid entity so reduce it.
00106           writable.Append(PRUnichar(theNCRValue));
00107         }
00108       }
00109     }
00110     else if (ch==kHashsign && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) { // numeric reference, "&#..."
00111       result=CEntityToken::ConsumeEntity(ch,entity,aScanner);
00112       if (NS_SUCCEEDED(result)) {
00113         nsSubstring &writable = aString.writable();
00114         if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) { // tested inside NS_SUCCEEDED, so this code counts as a success value
00115           // Looked like an entity but it's not
00116           aScanner.GetChar(amp); // consume one character and copy it through unchanged
00117           writable.Append(amp);
00118           result = NS_OK; // just being safe..
00119         }
00120         else {
00121           PRInt32 err; // NOTE(review): filled in by ToInteger but never examined
00122           theNCRValue=entity.ToInteger(&err,kAutoDetect);
00123           AppendNCR(writable, theNCRValue);
00124         }
00125       }
00126     }
00127     else {
00128       // What we thought as entity is not really an entity...
00129       aScanner.GetChar(amp); // this branch is also the one taken whenever NS_IPARSER_FLAG_VIEW_SOURCE is set
00130       aString.writable().Append(amp);
00131     }//if
00132   }
00133 
00134   return result;
00135 }
00136 
00137 /*
00138  *  General purpose helper that appends text from the scanner to aString,
00139  *  repeating ReadUntil passes until it stops on a character it does not handle.
00140  *  Note: It also reduces entities ('&') and, if allowed, copies newlines.
00141  *  @param   aString -- receives the consumed text
00142  *  @param   aNewlineCount -- the newline count to increment when hitting newlines
00143  *  @param   aScanner -- controller of underlying input source
00144  *  @param   aEndCondition -- characters that stop each ReadUntil pass.
00145  *  @param   aAllowNewlines -- whether to allow newlines in the value.
00146  *                             XXX it would be nice to roll this info into
00147  *                             aEndCondition somehow....
00148  *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
00149  *  @return  error result
00150  */
00151 static
00152 nsresult ConsumeUntil(nsScannerSharedSubstring& aString,
00153                       PRInt32& aNewlineCount,
00154                       nsScanner& aScanner,
00155                       const nsReadEndCondition& aEndCondition,
00156                       PRBool aAllowNewlines,
00157                       PRInt32 aFlag)
00158 {
00159   nsresult result = NS_OK;
00160   PRBool   done = PR_FALSE;
00161   
00162   do {
00163     result = aScanner.ReadUntil(aString,aEndCondition,PR_FALSE);
00164     if(NS_SUCCEEDED(result)) {
00165       PRUnichar ch;
00166       aScanner.Peek(ch); // NOTE(review): the return value is ignored; ch is tested below regardless
00167       if(ch == kAmpersand) {
00168         result = ConsumeEntity(aString,aScanner,aFlag); // the '&' is left on the scanner for ConsumeEntity
00169       }
00170       else if(ch == kCR && aAllowNewlines) {
00171         aScanner.GetChar(ch); // consume the CR, then check whether an LF follows
00172         result = aScanner.Peek(ch);
00173         if (NS_SUCCEEDED(result)) {
00174           nsSubstring &writable = aString.writable();
00175           if(ch == kNewLine) {
00176             writable.AppendLiteral("\r\n"); // CR+LF is copied as-is and counted as one newline
00177             aScanner.GetChar(ch);
00178           }
00179           else {
00180             writable.Append(PRUnichar('\r'));
00181           }
00182           ++aNewlineCount;
00183         }
00184       }
00185       else if(ch == kNewLine && aAllowNewlines) {
00186         aScanner.GetChar(ch);
00187         aString.writable().Append(PRUnichar('\n'));
00188         ++aNewlineCount;
00189       }
00190       else {
00191         done = PR_TRUE; // any other stop character is left on the scanner for the caller
00192       }
00193     }
00194   } while (NS_SUCCEEDED(result) && !done);
00195 
00196   return result;
00197 }
00198 
00199 /**************************************************************
00200   And now for the token classes...
00201  **************************************************************/
00202 
00203 /*
00204  *  constructor from tag id
00205  *  
00206  *  @update  gess 3/25/98
00207  *  @param   aTag -- tag id, forwarded unchanged to the CToken base constructor
00208  *  @return  (constructor -- no return value)
00209  */
00210 CHTMLToken::CHTMLToken(eHTMLTags aTag) : CToken(aTag) {
00211 }
00212 
00213 
00214 CHTMLToken::~CHTMLToken() {
00215   // Empty body: no cleanup is performed at this level.
00216 }
00217 
00218 /*
00219  *  constructor from tag id
00220  *  Starts with mEmpty cleared and mContainerInfo set to eFormUnknown.
00221  *  @update  gess 3/25/98
00222  *  @param   aTag -- tag id, forwarded to the CHTMLToken constructor
00223  *  @return  (constructor -- no return value)
00224  */
00225 CStartToken::CStartToken(eHTMLTags aTag) : CHTMLToken(aTag) {
00226   mEmpty=PR_FALSE;
00227   mContainerInfo=eFormUnknown;
00228 #ifdef DEBUG
00229   mAttributed = PR_FALSE; // initialised only when DEBUG is defined
00230 #endif
00231 }
00232 
00233 CStartToken::CStartToken(const nsAString& aName) : CHTMLToken(eHTMLTag_unknown) {
00234   mEmpty=PR_FALSE; // name-only form: the tag id starts out as eHTMLTag_unknown
00235   mContainerInfo=eFormUnknown;
00236   mTextValue.Assign(aName); // keep the name; GetTypeID() looks the id up from it while the id is still unknown
00237 #ifdef DEBUG
00238   mAttributed = PR_FALSE; // initialised only when DEBUG is defined
00239 #endif
00240 }
00241 
00242 CStartToken::CStartToken(const nsAString& aName,eHTMLTags aTag) : CHTMLToken(aTag) {
00243   mEmpty=PR_FALSE; // name-and-id form: the caller supplies both, no lookup is done here
00244   mContainerInfo=eFormUnknown;
00245   mTextValue.Assign(aName); // stored as given; it is not checked against aTag
00246 #ifdef DEBUG
00247   mAttributed = PR_FALSE; // initialised only when DEBUG is defined
00248 #endif
00249 }
00250 
00251 /*
00252  *  This method returns the typeid (the tag type) for this token.
00253  *  While the id is still eHTMLTag_unknown it is looked up from mTextValue and stored.
00254  *  @update  gess 3/25/98
00255  *  @param   (none)
00256  *  @return  the tag id held in mTypeID after the lookup, if one was needed
00257  */
00258 PRInt32 CStartToken::GetTypeID(){
00259   if(eHTMLTag_unknown==mTypeID) {
00260     mTypeID = nsHTMLTags::LookupTag(mTextValue); // remember the result so later calls skip the lookup unless it is still unknown
00261   }
00262   return mTypeID;
00263 }
00264 
00265 /*
00266  *  Reports which kind of token this class represents.
00267  *  
00268  *  @update  gess 3/25/98
00269  *  @param   (none)
00270  *  @return  always eToken_start
00271  */
00272 PRInt32 CStartToken::GetTokenType(void) {
00273   return eToken_start;
00274 }
00275 
00276 /*
00277  *  Stores the given flag in mEmpty.
00278  *  NOTE(review): what "empty" means for a start tag is not shown here -- confirm.
00279  *  @update  gess 3/25/98
00280  *  @param   aValue -- new value for mEmpty
00281  *  @return  nothing
00282  */
00283 void CStartToken::SetEmpty(PRBool aValue) {
00284   mEmpty=aValue;
00285 }
00286 
00287 /*
00288  *  Reads back the flag kept in mEmpty (PR_FALSE after construction).
00289  *  
00290  *  @update  gess 3/25/98
00291  *  @param   (none)
00292  *  @return  the current value of mEmpty
00293  */
00294 PRBool CStartToken::IsEmpty(void) {
00295   return mEmpty;
00296 }
00297 
00298 
00299 /*
00300  *  Consume the identifier portion of the start tag
00301  *  and set mTypeID (and, in some cases, mTextValue) from it.
00302  *  @update  gess 3/25/98
00303  *  @param   aChar -- last char consumed from stream (not used by this method)
00304  *  @param   aScanner -- controller of underlying input source
00305  *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
00306  *  @return  error result; kEOF becomes NS_OK when the scanner is not incremental
00307  */
00308 nsresult CStartToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
00309 
00310   //if you're here, we've already Consumed the < char, and are
00311    //ready to Consume the rest of the open tag identifier.
00312    //Stop consuming as soon as you see a space or a '>'.
00313    //NOTE: We don't Consume the tag attributes here, nor do we eat the ">"
00314 
00315   nsresult result=NS_OK;
00316   nsScannerSharedSubstring tagIdent;
00317 
00318   if (aFlag & NS_IPARSER_FLAG_HTML) {
00319     result = aScanner.ReadTagIdentifier(tagIdent);
00320     mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagIdent.str()); // NOTE(review): the lookup runs even if ReadTagIdentifier reported an error
00321     // Save the original tag string if this is user-defined or if we
00322     // are viewing source
00323     if(eHTMLTag_userdefined==mTypeID || (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
00324       mTextValue = tagIdent.str();
00325     }
00326   }
00327   else {
00328     result = aScanner.ReadTagIdentifier(tagIdent); // without the HTML flag the name is always kept
00329     mTextValue = tagIdent.str();
00330     mTypeID = nsHTMLTags::LookupTag(mTextValue);
00331   }
00332 
00333   if (NS_SUCCEEDED(result) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
00334     result = aScanner.SkipWhitespace(mNewlineCount); // whitespace after the name is not skipped in view-source mode
00335   }
00336 
00337   if (kEOF == result && !aScanner.IsIncremental()) {
00338     // Take what we can get.
00339     result = NS_OK;
00340   }
00341 
00342   return result;
00343 }
00344 
00345 
00346 const nsSubstring& CStartToken::GetStringValue()
00347 { // Returns mTextValue, first filling it from the tag table when it is empty and the id is in range.
00348   if((eHTMLTag_unknown<mTypeID) && (mTypeID<eHTMLTag_text)) { // id strictly between eHTMLTag_unknown and eHTMLTag_text
00349     if(!mTextValue.Length()) { // no name stored yet
00350       mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
00351     }
00352   }
00353   return mTextValue;
00354 }
00355 
00356 /*
00357  *  Replaces the contents of anOutputString with this token's source text:
00358  *  the string is truncated and AppendSourceTo() then writes into it.
00359  *  @update  gess 3/25/98
00360  *  @param   anOutputString will receive the result
00361  *  @return  nada
00362  */
00363 void CStartToken::GetSource(nsString& anOutputString){
00364   anOutputString.Truncate();
00365   AppendSourceTo(anOutputString);
00366 }
00367 
00368 /*
00369  *  Appends '<', the tag name and '>' to the output string.
00370  *  The name is mTextValue when set, otherwise GetTagName(mTypeID).
00371  *  @update  harishd 03/23/00
00372  *  @param   anOutputString -- the text is appended to this string
00373  *  @return  nada
00374  */
00375 void CStartToken::AppendSourceTo(nsAString& anOutputString){
00376   anOutputString.Append(PRUnichar('<'));
00377   /*
00378    * Watch out for Bug 15204 
00379    */
00380   if(!mTextValue.IsEmpty())
00381     anOutputString.Append(mTextValue);
00382   else
00383     anOutputString.Append(GetTagName(mTypeID));
00384 
00385   anOutputString.Append(PRUnichar('>')); // only the name is written between '<' and '>'
00386 }
00387 
00388 /*
00389  *  constructor from tag id
00390  *  
00391  *  @update  gess 3/25/98
00392  *  @param   aTag -- tag id, forwarded to the CHTMLToken constructor
00393  *  @return  (constructor -- no return value)
00394  */
00395 CEndToken::CEndToken(eHTMLTags aTag) : CHTMLToken(aTag) {
00396 }
00397 
00398 CEndToken::CEndToken(const nsAString& aName) : CHTMLToken(eHTMLTag_unknown) {
00399   mTextValue.Assign(aName); // name-only form: the id stays eHTMLTag_unknown until GetTypeID() looks it up
00400 }
00401 
00402 CEndToken::CEndToken(const nsAString& aName,eHTMLTags aTag) : CHTMLToken(aTag) {
00403   mTextValue.Assign(aName); // name-and-id form: stored as given, not checked against aTag
00404 }
00405 
00406 /*
00407  *  Consume the identifier portion of the end tag
00408  *  and set mTypeID (and, in some cases, mTextValue) from it.
00409  *  @update  gess 3/25/98
00410  *  @param   aChar -- last char consumed from stream (not used by this method)
00411  *  @param   aScanner -- controller of underlying input source
00412  *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
00413  *  @return  error result; kEOF becomes NS_OK when the scanner is not incremental
00414  */
00415 nsresult CEndToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) 
00416 {
00417   nsresult result = NS_OK;
00418   nsScannerSharedSubstring tagIdent;
00419 
00420   if (aFlag & NS_IPARSER_FLAG_HTML) {
00421     result = aScanner.ReadTagIdentifier(tagIdent);
00422     
00423     mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagIdent.str()); // NOTE(review): the lookup runs even if ReadTagIdentifier reported an error
00424     // Save the original tag string if this is user-defined or if we
00425     // are viewing source
00426     if(eHTMLTag_userdefined==mTypeID || 
00427        (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
00428       mTextValue = tagIdent.str();
00429     }
00430   }
00431   else {
00432     result = aScanner.ReadTagIdentifier(tagIdent); // without the HTML flag the name is always kept
00433     mTextValue = tagIdent.str();
00434     mTypeID = nsHTMLTags::LookupTag(mTextValue);
00435   }
00436 
00437   if (NS_SUCCEEDED(result) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
00438     result = aScanner.SkipWhitespace(mNewlineCount); // whitespace after the name is not skipped in view-source mode
00439   }
00440 
00441   if (kEOF == result && !aScanner.IsIncremental()) {
00442     // Take what we can get.
00443     result = NS_OK;
00444   }
00445 
00446   return result;
00447 }
00448 
00449 
00450 /*
00451  *  Asks the token to determine the <i>HTMLTag type</i> of
00452  *  the token. This turns around and looks up the tag name
00453  *  in the tag dictionary; a dir or menu result is stored as ul.
00454  *  
00455  *  @update  gess 3/25/98
00456  *  @param   (none)
00457  *  @return  eHTMLTag id of this endtag
00458  */
00459 PRInt32 CEndToken::GetTypeID(){
00460   if(eHTMLTag_unknown==mTypeID) { // the lookup and the remapping below happen only while the id is unknown
00461     mTypeID = nsHTMLTags::LookupTag(mTextValue);
00462     switch(mTypeID) {
00463       case eHTMLTag_dir:
00464       case eHTMLTag_menu:
00465         mTypeID=eHTMLTag_ul;
00466         break;
00467       default:
00468         break;
00469     }
00470   }
00471   return mTypeID;
00472 }
00473 
00474 /*
00475  *  Reports which kind of token this class represents.
00476  *  
00477  *  @update  gess 3/25/98
00478  *  @param   (none)
00479  *  @return  always eToken_end
00480  */
00481 PRInt32 CEndToken::GetTokenType(void) {
00482   return eToken_end;
00483 }
00484 
00485 const nsSubstring& CEndToken::GetStringValue()
00486 { // Returns mTextValue, first filling it from the tag table when it is empty and the id is in range.
00487   if((eHTMLTag_unknown<mTypeID) && (mTypeID<eHTMLTag_text)) { // id strictly between eHTMLTag_unknown and eHTMLTag_text
00488     if(!mTextValue.Length()) { // no name stored yet
00489       mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
00490     }
00491   }
00492   return mTextValue;
00493 }
00494 
00495 /*
00496  *  Replaces the contents of anOutputString with this token's source text:
00497  *  the string is truncated and AppendSourceTo() then writes into it.
00498  *  @update  gess 3/25/98
00499  *  @param   anOutputString will receive the result
00500  *  @return  nada
00501  */
00502 void CEndToken::GetSource(nsString& anOutputString){
00503   anOutputString.Truncate();
00504   AppendSourceTo(anOutputString);
00505 }
00506 
00507 /*
00508  *  Appends "</", the tag name and '>' to the output string.
00509  *  The name is mTextValue when set, otherwise GetTagName(mTypeID).
00510  *  @update  harishd 03/23/00
00511  *  @param   anOutputString -- the text is appended to this string
00512  *  @return  nada
00513  */
00514 void CEndToken::AppendSourceTo(nsAString& anOutputString){
00515   anOutputString.AppendLiteral("</");
00516   if(!mTextValue.IsEmpty())
00517     anOutputString.Append(mTextValue);
00518   else
00519     anOutputString.Append(GetTagName(mTypeID));
00520 
00521   anOutputString.Append(PRUnichar('>'));
00522 }
00523 
00524 /*
00525  *  default constructor
00526  *  Creates a text token tagged eHTMLTag_text; no text is set here.
00527  *  @update  gess 3/25/98
00528  *  @param   (none)
00529  *  @return  (constructor -- no return value)
00530  */
00531 CTextToken::CTextToken() : CHTMLToken(eHTMLTag_text) {
00532 }
00533 
00534 
00535 /*
00536  *  string based constructor
00537  *  Creates a text token tagged eHTMLTag_text whose value is bound to aName.
00538  *  @update  gess 3/25/98
00539  *  @param   aName -- string to init token name with
00540  *  @return  (constructor -- no return value)
00541  */
00542 CTextToken::CTextToken(const nsAString& aName) : CHTMLToken(eHTMLTag_text) {
00543   mTextValue.Rebind(aName); // NOTE(review): Rebind presumably refers to aName's data rather than copying it -- confirm the lifetime requirements
00544 }
00545 
00546 /*
00547  *  Reports which kind of token this class represents.
00548  *  
00549  *  @update  gess 3/25/98
00550  *  @param   (none)
00551  *  @return  always eToken_text
00552  */
00553 PRInt32 CTextToken::GetTokenType(void) {
00554   return eToken_text;
00555 }
00556 
00557 PRInt32 CTextToken::GetTextLength(void) {
00558   return mTextValue.Length(); // length of the text currently held by this token
00559 }
00560 
00561 /*
00562  *  Consume as much clear text from scanner as possible.
00563  *  Stops at '&' or '<'; newlines are absorbed and counted in mNewlineCount.
00564  *  @update  gess 3/25/98
00565  *  @param   aChar -- reused as scratch for peeked characters; the incoming value is not read
00566  *  @param   aScanner -- controller of underlying input source
00567  *  @param   aFlag -- not used by this method
00568  *  @return  error result */
00569 nsresult CTextToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
00570   static const PRUnichar theTerminalsChars[] = 
00571     { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('&'), PRUnichar('<'),
00572       PRUnichar(0) };
00573   static const nsReadEndCondition theEndCondition(theTerminalsChars);
00574   nsresult  result=NS_OK;
00575   PRBool    done=PR_FALSE;
00576   nsScannerIterator origin, start, end;
00577   
00578   // Start scanning after the first character, because we know it to
00579   // be part of this text token (we wouldn't have come here if it weren't)
00580   aScanner.CurrentPosition(origin);
00581   start = origin;
00582   aScanner.EndReading(end);
00583 
00584   NS_ASSERTION(start != end, "Calling CTextToken::Consume when already at the "
00585                              "end of a document is a bad idea.");
00586 
00587   aScanner.SetPosition(++start);
00588 
00589   while((NS_OK==result) && (!done)) {
00590     result=aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE); // NOTE(review): presumably leaves 'end' at the stop character -- the code below relies on it
00591     if(NS_OK==result) {
00592       result=aScanner.Peek(aChar);
00593 
00594       if (NS_OK == result && (kCR == aChar || kNewLine == aChar)) {
00595         switch (aChar) {
00596           case kCR:
00597           {
00598             // It's a carriage return. See if this is part of a CR-LF pair (in
00599             // which case we need to treat it as one newline). If we're at the
00600             // edge of a packet, then leave the CR on the scanner, since it
00601             // could still be part of a CR-LF pair. Otherwise, it isn't.
00602             PRUnichar theNextChar;
00603             result = aScanner.Peek(theNextChar, 1);
00604 
00605             if (result == kEOF && aScanner.IsIncremental()) {
00606               break;
00607             }
00608 
00609             if (NS_SUCCEEDED(result)) {
00610               // Actually get the carriage return.
00611               aScanner.GetChar(aChar);
00612             }
00613 
00614             if (kLF == theNextChar) { // NOTE(review): also reached when the Peek above failed; relies on Peek setting its out-parameter -- confirm
00615               // If the "\r" is followed by a "\n", don't replace it and let
00616               // it be ignored by the layout system.
00617               end.advance(2);
00618               aScanner.GetChar(theNextChar);
00619             }
00620             else {
00621               // If it is standalone, replace the "\r" with a "\n" so that it
00622               // will be considered by the layout system.
00623               aScanner.ReplaceCharacter(end, kLF);
00624               ++end;
00625             }
00626             ++mNewlineCount;
00627             break;
00628           }
00629           case kLF:
00630             aScanner.GetChar(aChar);
00631             ++end;
00632             ++mNewlineCount;
00633             break;
00634         } //switch
00635       }
00636       else done=PR_TRUE; // stopped on something other than CR/LF, or the Peek failed
00637     }
00638   }
00639 
00640   // Note: This function is only called from nsHTMLTokenizer::ConsumeText. If
00641   // we return an error result from the final buffer, then it is responsible
00642   // for turning it into an NS_OK result.
00643   aScanner.BindSubstring(mTextValue, origin, end); // the token text runs from the starting position up to 'end'
00644 
00645   return result;
00646 }
00647 
00648 /*
00649  *  Consume as much clear text from scanner as possible.
00650  *  The scanner is left on the < of the perceived end tag.
00651  *  The consumed range is bound into mTextValue.
00652  *
00653  *  @param   aConservativeConsume -- controls our handling of content with no
00654  *                                   terminating string.
00655  *  @param   aIgnoreComments -- whether or not we should take comments into
00656  *                              account in looking for the end tag.
00657  *  @param   aScanner -- controller of underlying input source
00658  *  @param   aEndTagName -- the terminal tag name.
00659  *  @param   aFlag -- dtd modes and such.
00660  *  @param   aFlushTokens -- set to PR_TRUE if we found the terminal tag.
00661  *  @return  NS_OK if the end tag was found; kEOF if an incremental scanner ran out; else kFakeEndTag
00662  */
00663 nsresult CTextToken::ConsumeCharacterData(PRBool aConservativeConsume,
00664                                           PRBool aIgnoreComments,
00665                                           nsScanner& aScanner,
00666                                           const nsAString& aEndTagName,
00667                                           PRInt32 aFlag,
00668                                           PRBool& aFlushTokens) {
00669   nsresult      result=NS_OK;
00670   nsScannerIterator theStartOffset, theCurrOffset, theTermStrPos, theStartCommentPos, theAltTermStrPos, endPos;
00671   PRBool        done=PR_FALSE;
00672   PRBool        theLastIteration=PR_FALSE;
00673 
00674   aScanner.CurrentPosition(theStartOffset);
00675   theCurrOffset = theStartOffset;
00676   aScanner.EndReading(endPos);
00677   theTermStrPos = theStartCommentPos = theAltTermStrPos = endPos; // endPos doubles as the "not found yet" marker for all three
00678 
00679   // ALGORITHM: *** The performance is based on correctness of the document ***
00680   // 1. Look for a '<' character.  This could be
00681   //    a) Start of a comment (<!--), b) Start of the terminal string, or c) a start of a tag.
00682   //    We are interested in a) and b). c) is ignored because in CDATA we don't care for tags.
00683   //    NOTE: Technically speaking in CDATA we should ignore the comments too!! But for compatibility
00684   //          we don't.
00685   // 2. Having the offset, for '<', search for the terminal string from there on and record its offset.
00686   // 3. From the same '<' offset also search for start of a comment '<!--'. If found search for
00687   //    end comment '-->' between the terminal string and '<!--'.  If you did not find the end
00688   //    comment, then we have a malformed document, i.e., this section has a premature terminal string
00689   //    Ex. <SCRIPT><!-- document.write('</SCRIPT>') //--> </SCRIPT>. But record terminal string's
00690   //    offset if this is the first premature terminal string, and update the current offset to the terminal 
00691   //    string (premature) offset and goto step 1.
00692   // 4. Amen...If you found a terminal string and '-->'. Otherwise goto step 1.
00693   // 5. If the end of the document is reached and if we still don't have the condition in step 4. then
00694   //    assume that the premature terminal string is the actual terminal string and goto step 1. This
00695   //    will be our last iteration. If there is no premature terminal string and we're being
00696   //    conservative in our consumption (aConservativeConsume), then don't consume anything
00697   //    from the scanner. Otherwise, we consume all the way until the end (for <xmp>).
00698 
00699   NS_NAMED_LITERAL_STRING(ltslash, "</");
00700   const nsString theTerminalString = ltslash + aEndTagName; // the text searched for: "</" followed by the tag name
00701 
00702   PRUint32 termStrLen=theTerminalString.Length();
00703   while((result == NS_OK) && !done) {
00704     PRBool found = PR_FALSE;
00705     nsScannerIterator gtOffset,ltOffset = theCurrOffset;
00706     while (FindCharInReadable(PRUnichar(kLessThan), ltOffset, endPos) &&
00707            ((PRUint32)ltOffset.size_forward() >= termStrLen ||
00708             Distance(ltOffset, endPos) >= termStrLen)) { // only '<' positions with room for the whole terminal string are tried
00709       // Make a copy of the (presumed) end tag and
00710       // do a case-insensitive comparison
00711 
00712       nsScannerIterator start(ltOffset), end(ltOffset);
00713       end.advance(termStrLen);
00714 
00715       if (CaseInsensitiveFindInReadable(theTerminalString,start,end) && 
00716           (end == endPos || (*end == '>'  || *end == ' '  || 
00717                              *end == '\t' || *end == '\n' || 
00718                              *end == '\r'))) {
00719         gtOffset = end;
00720         // Note that aIgnoreComments is only not set for <script>. We don't
00721         // want to execute scripts that aren't in the form of: <script\s.*>
00722         if ((end == endPos && aIgnoreComments) || 
00723             FindCharInReadable(PRUnichar(kGreaterThan), gtOffset, endPos)) {
00724           found = PR_TRUE;
00725           theTermStrPos = start;
00726         }
00727         break;
00728       }
00729       ltOffset.advance(1);
00730     }
00731      
00732     if (found && theTermStrPos != endPos) {
00733       if(!(aFlag & NS_IPARSER_FLAG_STRICT_MODE) &&
00734          !theLastIteration && !aIgnoreComments) {
00735         nsScannerIterator endComment(ltOffset);
00736         endComment.advance(5); // NOTE(review): not checked against endPos here -- confirm this cannot step past the buffer
00737          
00738         if ((theStartCommentPos == endPos) &&
00739             FindInReadable(NS_LITERAL_STRING("<!--"), theCurrOffset, endComment)) {
00740           theStartCommentPos = theCurrOffset;
00741         }
00742          
00743         if (theStartCommentPos != endPos) {
00744           // Search for --> between <!-- and </TERMINALSTRING>.
00745           theCurrOffset = theStartCommentPos;
00746           nsScannerIterator terminal(theTermStrPos);
00747           if (!RFindInReadable(NS_LITERAL_STRING("-->"),
00748                                theCurrOffset, terminal)) {
00749             // If you're here it means that we have a bogus terminal string.
00750             // Even though it is bogus, the position of the terminal string
00751             // could be helpful in case we hit the rock bottom.
00752             if (theAltTermStrPos == endPos) {
00753               // But we only want to remember the first bogus terminal string.
00754               theAltTermStrPos = theTermStrPos;
00755             }
00756     
00757             // We did not find '-->' so keep searching for terminal string.
00758             theCurrOffset = theTermStrPos;
00759             theCurrOffset.advance(termStrLen);
00760             continue;
00761           }
00762         }
00763       }
00764 
00765       aScanner.BindSubstring(mTextValue, theStartOffset, theTermStrPos); // token text is everything before the terminal string
00766       aScanner.SetPosition(ltOffset); // leave the scanner on the '<' of the end tag
00767       
00768       // We found </SCRIPT> or </STYLE>...permit flushing -> Ref: Bug 22485
00769       aFlushTokens=PR_TRUE;
00770       done = PR_TRUE;
00771     }
00772     else {
00773       // We end up here if:
00774       // a) when the buffer runs out of data.
00775       // b) when the terminal string is not found.
00776       if(!aScanner.IsIncremental()) {
00777         if(theAltTermStrPos != endPos && aConservativeConsume) {
00778           // If you're here it means..we hit the rock bottom and therefore switch to plan B.
00779           theCurrOffset = theAltTermStrPos;
00780           theLastIteration = PR_TRUE;
00781         }
00782         else if (!aConservativeConsume) {
00783           done = PR_TRUE; // Do this to fix Bug. 35456
00784           result = kFakeEndTag;
00785           aScanner.BindSubstring(mTextValue, theStartOffset, endPos); // take everything up to the end of the buffer
00786           aScanner.SetPosition(endPos);
00787         }
00788         else {
00789           done = PR_TRUE;
00790           result = kFakeEndTag;
00791           // We still need to bind our value; the range bound here (start..start) is empty.
00792           aScanner.BindSubstring(mTextValue, theStartOffset, theStartOffset); // the scanner position is not moved in this branch
00793         }
00794       }
00795       else {
00796         result=kEOF; // incremental scanner: report kEOF without binding or consuming anything
00797       }
00798     }
00799   }
00800 
00801   return result;
00802 }
00803 
00804 /*
00805  *  Consume as much clear text from scanner as possible. Reducing entities.
00806  *  The scanner is left on the < of the perceived end tag.
00807  *
00808  *  @param   aChar -- last char consumed from stream
00809  *  @param   aConservativeConsume -- controls our handling of content with no
00810  *                                   terminating string.
00811  *  @param   aScanner -- controller of underlying input source
00812  *  @param   aEndTagname -- the terminal tag name.
00813  *  @param   aFlag -- dtd modes and such.
00814  *  @param   aFlushTokens -- PR_TRUE if we found the terminal tag.
00815  *  @return  error result
00816  */
00817 nsresult CTextToken::ConsumeParsedCharacterData(PRBool aDiscardFirstNewline,
00818                                                 PRBool aConservativeConsume,
00819                                                 nsScanner& aScanner,
00820                                                 const nsAString& aEndTagName,
00821                                                 PRInt32 aFlag,
00822                                                 PRBool& aFound)
00823 {
00824   // This function is fairly straightforward except if there is no terminating
00825   // string. If there is, we simply loop through all of the entities, reducing
00826   // them as necessary and skipping over non-terminal strings starting with <.
00827   // If there is *no* terminal string, then we examine aConservativeConsume.
00828   // If we want to be conservative, we backtrack to the first place in the
00829   // document that looked like the end of PCDATA (i.e., the first tag). This
00830   // is for compatibility and so we don't regress bug 42945. If we are not
00831   // conservative, then we consume everything, all the way up to the end of
00832   // the document.
00833 
00834   static const PRUnichar terminalChars[] = {
00835     PRUnichar('\r'), PRUnichar('\n'), PRUnichar('&'), PRUnichar('<'),
00836     PRUnichar(0)
00837   };
00838   static const nsReadEndCondition theEndCondition(terminalChars);
00839 
00840   nsScannerIterator currPos, endPos, altEndPos;
00841   PRUint32 truncPos = 0; // length of theContent at the moment altEndPos was recorded
00842   aScanner.CurrentPosition(currPos);
00843   aScanner.EndReading(endPos);
00844 
00845   altEndPos = endPos; // altEndPos == endPos means "no fallback stopping point seen yet"
00846 
00847   nsScannerSharedSubstring theContent;
00848   PRUnichar ch = 0;
00849 
00850   NS_NAMED_LITERAL_STRING(commentStart, "<!--");
00851   NS_NAMED_LITERAL_STRING(ltslash, "</");
00852   const nsString theTerminalString = ltslash + aEndTagName; // i.e. "</" followed by the tag name
00853   PRUint32 termStrLen = theTerminalString.Length();
00854   PRUint32 commentStartLen = commentStart.Length();
00855 
00856   nsresult result = NS_OK;
00857 
00858   // Note that if we're already at the end of the document, the ConsumeUntil
00859   // will fail, and we'll do the right thing.
00860   do {
00861     result = ConsumeUntil(theContent, mNewlineCount, aScanner, 
00862                           theEndCondition, PR_TRUE, aFlag);
00863 
00864     if (aDiscardFirstNewline && 
00865         (NS_SUCCEEDED(result) || !aScanner.IsIncremental()) &&
00866         !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
00867       // Check if the very first character is a newline, and if so discard it.
00868       // Note that we don't want to discard it in view source!
00869       // Also note that this has to happen here (as opposed to before the
00870       // ConsumeUntil) because we have to expand any entities.
00871       // XXX It would be nice to be able to do this without calling
00872       // writable()!
00873       const nsSubstring &firstChunk = theContent.str();
00874       if (!firstChunk.IsEmpty()) {
00875         PRUint32 where = 0; // number of leading characters to drop (0, 1 or 2)
00876         PRUnichar newline = firstChunk.First();
00877 
00878         if (newline == kCR || newline == kNewLine) {
00879           ++where;
00880 
00881           if (firstChunk.Length() > 1) {
00882             if (newline == kCR && firstChunk.CharAt(1) == kNewLine) {
00883               // Handle \r\n = 1 newline.
00884               ++where;
00885             }
00886             // Note: \n\r = 2 newlines.
00887           }
00888         }
00889 
00890         if (where != 0) {
00891           theContent.writable() = Substring(firstChunk, where);
00892         }
00893       }
00894     }
00895     aDiscardFirstNewline = PR_FALSE; // only the first pass through the loop may drop a newline
00896 
00897     if (NS_FAILED(result)) {
00898       if (kEOF == result && !aScanner.IsIncremental()) {
00899         aFound = PR_TRUE; // this is as good as it gets.
00900         result = kFakeEndTag;
00901 
00902         if (aConservativeConsume && altEndPos != endPos) {
00903           // We ran out of room looking for a </title>. Go back to the first
00904           // place that looked like a tag and use that as our stopping point.
00905           theContent.writable().Truncate(truncPos);
00906           aScanner.SetPosition(altEndPos, PR_FALSE, PR_TRUE);
00907         }
00908         // else we take everything we consumed.
00909         mTextValue.Rebind(theContent.str());
00910       }
00911       else {
00912         aFound = PR_FALSE;
00913       }
00914 
00915       return result;
00916     }
00917 
00918     aScanner.CurrentPosition(currPos);
00919     aScanner.GetChar(ch); // this character must be '&' or '<'
00920 
00921     if (ch == kLessThan && altEndPos == endPos) {
00922       // Keep this position in case we need it for later.
00923       altEndPos = currPos;
00924       truncPos = theContent.str().Length();
00925     }
00926 
00927     if (Distance(currPos, endPos) >= termStrLen) {
00928       nsScannerIterator start(currPos), end(currPos);
00929       end.advance(termStrLen);
00930 
00931       if (CaseInsensitiveFindInReadable(theTerminalString,start,end)) {
00932         if (end != endPos && (*end == '>'  || *end == ' '  || 
00933                               *end == '\t' || *end == '\n' || 
00934                               *end == '\r')) {
00935           aFound = PR_TRUE;
00936           mTextValue.Rebind(theContent.str());
00937 
00938           // Note: This SetPosition() is actually going backwards from the
00939           // scanner's mCurrentPosition (so we pass aReverse == PR_TRUE). This
00940           // is because we call GetChar() above after we get the current
00941           // position.
00942           aScanner.SetPosition(currPos, PR_FALSE, PR_TRUE);
00943           break;
00944         }
00945       }
00946     }
00947     // IE only consumes <!-- --> as comments in PCDATA.
00948     if (Distance(currPos, endPos) >= commentStartLen) {
00949       nsScannerIterator start(currPos), end(currPos);
00950       end.advance(commentStartLen);
00951 
00952       if (CaseInsensitiveFindInReadable(commentStart,start,end)) {
00953         CCommentToken consumer; // stack allocated.
00954 
00955         // CCommentToken expects us to be on the '-'
00956         aScanner.SetPosition(currPos.advance(2));
00957 
00958         // In quirks mode we consume too many things as comments, so pretend
00959         // that we're not by modifying aFlag.
00960         result = consumer.Consume(*currPos, aScanner, 
00961          (aFlag & ~NS_IPARSER_FLAG_QUIRKS_MODE) | NS_IPARSER_FLAG_STRICT_MODE);
00962         if (kEOF == result) {
00963           return kEOF; // this can only happen if we're really out of space.
00964         }
00965         else if (kNotAComment == result) {
00966           // Fall through and consume this as text.
00967           aScanner.CurrentPosition(currPos);
00968           aScanner.SetPosition(currPos.advance(1));
00969         }
00970         else {
00971           consumer.AppendSourceTo(theContent.writable());
00972           mNewlineCount += consumer.GetNewlineCount();
00973           continue;
00974         }
00975       }
00976     }
00977 
00978     result = kEOF; // this is what we return if the loop condition below ends the loop
00979     // We did not find the terminal string yet so
00980     // include the character that stopped consumption.
00981     theContent.writable().Append(ch);
00982   } while (currPos != endPos);
00983 
00984   return result;
00985 }
00986 
00987 void CTextToken::CopyTo(nsAString& aStr)
00988 {
00989   nsScannerIterator start, end;
00990   mTextValue.BeginReading(start);
00991   mTextValue.EndReading(end);
00992   CopyUnicodeTo(start, end, aStr); // hand the whole range of mTextValue to CopyUnicodeTo, targeting aStr
00993 }
00994 
00995 const nsSubstring& CTextToken::GetStringValue(void)
00996 {
00997   return mTextValue.AsString(); // the token's text, as exposed by mTextValue
00998 }
00999 
01000 void CTextToken::Bind(nsScanner* aScanner, nsScannerIterator& aStart, nsScannerIterator& aEnd)
01001 {
01002   aScanner->BindSubstring(mTextValue, aStart, aEnd); // let the scanner bind mTextValue to the range given by aStart/aEnd
01003 }
01004 
01005 void CTextToken::Bind(const nsAString& aStr)
01006 {
01007   mTextValue.Rebind(aStr); // string overload: rebind mTextValue to aStr directly, no scanner involved
01008 }
01009 
01010 /*
01011  *  tag based constructor (aTag has no default visible here)
01012  *  
01013  *  @update  vidur 11/12/98
01014  *  @param   aTag -- tag id forwarded to the CHTMLToken base constructor
01015  *  @return  
01016  */
01017 CCDATASectionToken::CCDATASectionToken(eHTMLTags aTag) : CHTMLToken(aTag) {
01018 }
01019 
01020 
01021 /*
01022  *  string based constructor; the token is tagged eHTMLTag_unknown
01023  *  
01024  *  @update  vidur 11/12/98
01025  *  @param   aName -- string assigned to mTextValue
01026  *  @return  
01027  */
01028 CCDATASectionToken::CCDATASectionToken(const nsAString& aName) : CHTMLToken(eHTMLTag_unknown) {
01029   mTextValue.Assign(aName);
01030 }
01031 
01032 /*
01033  *  Report the type of this token.
01034  *  @update  vidur 11/12/98
01035  *  @param   (none)
01036  *  @return  eToken_cdatasection
01037  */
01038 PRInt32 CCDATASectionToken::GetTokenType(void) {
01039   return eToken_cdatasection;
01040 }
01041 
01042 /*
01043  *  Consume as much marked section text from scanner as possible.
01044  *  @update  rgess 12/15/99: had to handle case: "<![ ! IE 5]>", in addition to "<![..[..]]>".
01045  *  @param   aChar -- last char consumed from stream; reused below as scratch for peeked chars
01046  *  @param   aScanner -- controller of underlying input source
01047  *  @param   aFlag -- parser flags; NS_IPARSER_FLAG_VIEW_SOURCE makes CDATA sections require "]]>"
01048  *  @return  error result; kEOF on a non-incremental scanner becomes NS_OK with mInError set
01049  */
01050 nsresult CCDATASectionToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
01051   static const PRUnichar theTerminalsChars[] = 
01052   { PRUnichar('\r'), PRUnichar('\n'), PRUnichar(']'), PRUnichar(0) };
01053   static const nsReadEndCondition theEndCondition(theTerminalsChars);
01054   nsresult  result=NS_OK;
01055   PRBool    done=PR_FALSE;
01056 
01057   while((NS_OK==result) && (!done)) {
01058     result=aScanner.ReadUntil(mTextValue,theEndCondition,PR_FALSE);
01059     if(NS_OK==result) {
01060       result=aScanner.Peek(aChar);
01061       if((kCR==aChar) && (NS_OK==result)) {
01062         result=aScanner.GetChar(aChar); //strip off the \r
01063         result=aScanner.Peek(aChar);    //then see what's next.
01064         if(NS_OK==result) {
01065           switch(aChar) {
01066             case kCR:
01067               result=aScanner.GetChar(aChar); //strip off the \r
01068               mTextValue.AppendLiteral("\n\n"); // \r\r counts as two newlines
01069               mNewlineCount += 2;
01070               break;
01071             case kNewLine:
01072                //which means we saw \r\n, which becomes \n
01073               result=aScanner.GetChar(aChar); //strip off the \n
01074                   //now fall through on purpose...
01075             default:
01076               mTextValue.AppendLiteral("\n");
01077               mNewlineCount++;
01078               break;
01079           } //switch
01080         } //if
01081       }
01082       else if (kNewLine == aChar) {
01083         result=aScanner.GetChar(aChar);
01084         mTextValue.Append(aChar);
01085         ++mNewlineCount;
01086       }
01087       else if (kRightSquareBracket == aChar) {
01088         PRBool canClose = PR_FALSE; // becomes PR_TRUE only once two consecutive ']' were read
01089         result=aScanner.GetChar(aChar); //strip off the ]
01090         mTextValue.Append(aChar);
01091         result=aScanner.Peek(aChar);    //then see what's next.
01092         if((NS_OK==result) && (kRightSquareBracket==aChar)) {
01093           result=aScanner.GetChar(aChar); //strip off the second ]
01094           mTextValue.Append(aChar);
01095           canClose = PR_TRUE;
01096         }
01097         // The goal here is to not lose data from the page when encountering
01098         // markup like: <![endif]-->.  This means that in normal parsing, we
01099         // allow ']' to end the marked section and just drop everything between
01100         // it and the '>'.  In view-source mode, we cannot drop things on the
01101         // floor like that.  In fact, to make view-source of XML with script in
01102         // CDATA sections at all bearable, we need to somewhat enforce the ']]>'
01103         // terminator for marked sections.  So make the tokenization somewhat
01104         // different when in view-source _and_ dealing with a CDATA section.
01105         PRBool inCDATA = (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) &&
01106           StringBeginsWith(mTextValue, NS_LITERAL_STRING("[CDATA["));
01107         if (inCDATA) {
01108           result = aScanner.Peek(aChar);
01109         } else {
01110           nsAutoString dummy; // skip any bad data
01111           result=aScanner.ReadUntil(dummy,kGreaterThan,PR_FALSE);
01112         }
01113         if (NS_OK==result &&
01114             (!inCDATA || (canClose && kGreaterThan == aChar))) {
01115           result=aScanner.GetChar(aChar); //strip off the >
01116           done=PR_TRUE;
01117         }
01118       }
01119       else done=PR_TRUE;
01120     }
01121   }
01122 
01123   if (kEOF == result && !aScanner.IsIncremental()) {
01124     // We ran out of space looking for the end of this CDATA section.
01125     // In order to not completely lose the entire section, treat everything
01126     // until the end of the document as part of the CDATA section and let
01127     // the DTD handle it.
01128     mInError = PR_TRUE;
01129     result = NS_OK;
01130   }
01131 
01132   return result;
01133 }
01134 
01135 const nsSubstring& CCDATASectionToken::GetStringValue(void)
01136 {
01137   return mTextValue; // the text accumulated by the constructor and/or Consume()
01138 }
01139 
01140 
01141 /*
01142  *  default constructor; the token is tagged eHTMLTag_markupDecl
01143  *  
01144  *  @param   (none)
01145  *  @return  
01146  */
01147 CMarkupDeclToken::CMarkupDeclToken() : CHTMLToken(eHTMLTag_markupDecl) {
01148 }
01149 
01150 
01151 /*
01152  *  string based constructor; the token is tagged eHTMLTag_markupDecl
01153  *  
01154  *  @param   aName -- string that mTextValue is rebound to
01155  *  @return  
01156  */
01157 CMarkupDeclToken::CMarkupDeclToken(const nsAString& aName) : CHTMLToken(eHTMLTag_markupDecl) {
01158   mTextValue.Rebind(aName);
01159 }
01160 
01161 
01162 /*
01163  *  Report the type of this token.
01164  *  @param   (none)
01165  *  @return  eToken_markupDecl
01166  */
01167 PRInt32 CMarkupDeclToken::GetTokenType(void) {
01168   return eToken_markupDecl;
01169 }
01170 
01171 /*
01172  *  Consume as much declaration from scanner as possible.
01173  *  Declaration is a markup declaration of ELEMENT, ATTLIST, ENTITY or
01174  *  NOTATION, which can span multiple lines and ends in >.
01175  *  @param   aChar -- last char consumed from stream; reused below as scratch for peeked chars
01176  *  @param   aScanner -- controller of underlying input source
01177  *  @param   aFlag -- parser flags (not referenced in this method)
01178  *  @return  error result; kEOF sets mInError and is hidden as NS_OK on a non-incremental scanner
01179  */
01180 nsresult CMarkupDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
01181   static const PRUnichar theTerminalsChars[] = 
01182     { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('\''), PRUnichar('"'),
01183       PRUnichar('>'),
01184       PRUnichar(0) };
01185   static const nsReadEndCondition theEndCondition(theTerminalsChars);
01186   nsresult  result=NS_OK;
01187   PRBool    done=PR_FALSE;
01188   PRUnichar quote=0; // the quote char currently open, or 0 when not inside a quoted literal
01189 
01190   nsScannerIterator origin, start, end;
01191   aScanner.CurrentPosition(origin); // origin marks where the bound text will begin
01192   start = origin;
01193 
01194   while((NS_OK==result) && (!done)) {
01195     aScanner.SetPosition(start);
01196     result=aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
01197     if(NS_OK==result) {
01198       result=aScanner.Peek(aChar);
01199 
01200       if(NS_OK==result) {
01201         PRUnichar theNextChar=0;
01202         if ((kCR==aChar) || (kNewLine==aChar)) {
01203           result=aScanner.GetChar(aChar); //strip off the char
01204           result=aScanner.Peek(theNextChar);    //then see what's next.
01205         }
01206         switch(aChar) {
01207           case kCR:
01208             // result=aScanner.GetChar(aChar);       
01209             if(kLF==theNextChar) {
01210               // If the "\r" is followed by a "\n", don't replace it and 
01211               // let it be ignored by the layout system
01212               end.advance(2);
01213               result=aScanner.GetChar(theNextChar);
01214             }
01215             else {
01216               // If it standalone, replace the "\r" with a "\n" so that 
01217               // it will be considered by the layout system
01218               aScanner.ReplaceCharacter(end, kLF);
01219               ++end;
01220             }
01221             ++mNewlineCount;
01222             break;
01223           case kLF:
01224             ++end;
01225             ++mNewlineCount;
01226             break;
01227           case '\'':
01228           case '"':
01229             ++end;
01230             if (quote) {
01231               if (quote == aChar) { // only the matching quote char closes the literal
01232                 quote = 0;
01233               }
01234             } else {
01235               quote = aChar;
01236             }
01237             break;
01238           case kGreaterThan:
01239             if (quote) {
01240               ++end; // a '>' inside a quoted literal is ordinary content
01241             } else {
01242               start = end;
01243               ++start;  // Note that start is wrong after this, we just avoid temp var
01244               aScanner.SetPosition(start); // Skip the >
01245               done=PR_TRUE;
01246             }
01247             break;
01248           default:
01249             NS_ABORT_IF_FALSE(0,"should not happen, switch is missing cases?");
01250             break;
01251         } //switch
01252         start = end;
01253       }
01254       else done=PR_TRUE;
01255     } // if read until !ok
01256   } // while
01257   
01258   aScanner.BindSubstring(mTextValue, origin, end); // bound even when we stopped on an error
01259 
01260   if (kEOF == result) {
01261     mInError = PR_TRUE;
01262     if (!aScanner.IsIncremental()) {
01263       // Hide this EOF.
01264       result = NS_OK;
01265     }
01266   }
01267 
01268   return result;
01269 }
01270 
01271 const nsSubstring& CMarkupDeclToken::GetStringValue(void)
01272 {
01273   return mTextValue.AsString(); // the declaration text bound by the constructor or Consume()
01274 }
01275 
01276 
01277 /*
01278  *  Default constructor; the token is tagged eHTMLTag_comment
01279  *  
01280  *  @update  gess 3/25/98
01281  *  @param   (none)
01282  *  @return  
01283  */
01284 CCommentToken::CCommentToken() : CHTMLToken(eHTMLTag_comment) {
01285 }
01286 
01287 
01288 /*
01289  *  String based constructor (not a copy constructor); tagged eHTMLTag_comment
01290  *  
01291  *  @update  gess 3/25/98
01292  *  @param   aName -- string that mComment is rebound to
01293  *  @return  
01294  */
01295 CCommentToken::CCommentToken(const nsAString& aName) : CHTMLToken(eHTMLTag_comment) {
01296   mComment.Rebind(aName);
01297 }
01298 
01299 void CCommentToken::AppendSourceTo(nsAString& anOutputString){
01300   AppendUnicodeTo(mCommentDecl, anOutputString); // mCommentDecl is the declaration bound from the "<!" onwards by the Consume*Comment methods
01301 }
01302 
01303 static PRBool IsCommentEnd( // scans forward from aCurrent for the '>' that would close a comment
01304   const nsScannerIterator& aCurrent, 
01305   const nsScannerIterator& aEnd, 
01306   nsScannerIterator& aGt) // out: position of the '>' when PR_TRUE is returned; untouched otherwise
01307 {
01308   nsScannerIterator current = aCurrent;
01309   PRInt32 dashes = 0; // length of the run of consecutive '-' seen so far
01310 
01311   while ((current != aEnd) && (dashes != 2)) { // give up at the end of data or once another "--" is seen
01312     if (*current == kGreaterThan) {
01313       aGt = current;
01314       return PR_TRUE;
01315     }
01316     if (*current == PRUnichar('-')) {
01317       ++dashes;
01318     } else {
01319       dashes = 0; // any other character breaks the run
01320     }
01321     ++current;
01322   }
01323 
01324   return PR_FALSE;
01325 }
01326 
01327 nsresult CCommentToken::ConsumeStrictComment(nsScanner& aScanner) 
01328 {
01329   // <!--[... -- ... -- ...]*-->
01330   /*********************************************************
01331     NOTE: This algorithm does a fine job of handling comments
01332           when they're formatted per spec, but if they're not
01333           we don't handle them well.
01334    *********************************************************/
01335   nsScannerIterator end, current, gt, lt;
01336   aScanner.EndReading(end);
01337   aScanner.CurrentPosition(current);
01338 
01339   nsScannerIterator beginData = end; // stays == end until the opening "--" has been seen
01340 
01341   lt = current;
01342   lt.advance(-2); // <!
01343 
01344   // Regular comment must start with <!--
01345   if (current != end && *current == kMinus &&
01346       ++current != end && *current == kMinus &&
01347       ++current != end) {
01348     nsScannerIterator currentEnd = end;
01349     PRBool balancedComment = PR_FALSE;
01350     static NS_NAMED_LITERAL_STRING(dashes,"--");
01351     beginData = current;
01352 
01353     while (FindInReadable(dashes, current, currentEnd)) {
01354       current.advance(2); // step past the "--" that was just found
01355 
01356       balancedComment = !balancedComment; // We need to match '--' with '--'
01357     
01358       if (balancedComment && IsCommentEnd(current, end, gt)) {
01359         // done
01360         current.advance(-2); // back up so the closing "--" is left out of mComment
01361         // Note: it's ok if beginData == current, (we'll copy an empty string)
01362         // and we need to bind mComment anyway.
01363         aScanner.BindSubstring(mComment, beginData, current);
01364         aScanner.BindSubstring(mCommentDecl, lt, ++gt);
01365         aScanner.SetPosition(gt);
01366         return NS_OK;
01367       } else {
01368         // Continue after the last '--'
01369         currentEnd = end;
01370       }
01371     }
01372   }
01373 
01374   // If beginData == end, we did not find opening '--'
01375   if (beginData == end) {
01376     // This might have been empty comment: <!>
01377     // Or it could have been something completely bogus like: <!This is foobar>
01378     // Handle both cases below
01379     aScanner.CurrentPosition(current);
01380     beginData = current;
01381     if (FindCharInReadable('>', current, end)) {
01382       aScanner.BindSubstring(mComment, beginData, current); 
01383       aScanner.BindSubstring(mCommentDecl, lt, ++current);
01384       aScanner.SetPosition(current);
01385       return NS_OK;
01386     }
01387   }
01388 
01389   if (aScanner.IsIncremental()) {
01390     // We got here because we saw the beginning of a comment,
01391     // but not yet the end, and we are still loading the page. In that
01392     // case the return value here will cause us to unwind,
01393     // wait for more content, and try again.
01394     // XXX For performance reasons we should cache where we were, and
01395     //     continue from there for next call
01396     return kEOF;
01397   }
01398 
01399   // There was no terminating string, parse this comment as text.
01400   aScanner.SetPosition(lt, PR_FALSE, PR_TRUE);
01401   return kNotAComment;
01402 }
01403 
01404 nsresult CCommentToken::ConsumeQuirksComment(nsScanner& aScanner) 
01405 {
01406   // <![-[-]] ... [[-]-|--!]>
01407   /*********************************************************
01408     NOTE: This algorithm does a fine job of handling comments
01409           commonly used, but it doesn't really consume them
01410           per spec (But then, neither does IE or Nav).
01411    *********************************************************/
01412   nsScannerIterator end, current;
01413   aScanner.EndReading(end);
01414   aScanner.CurrentPosition(current);
01415   nsScannerIterator beginData = current, 
01416                     beginLastMinus = end,
01417                     bestAltCommentEnd = end,
01418                     lt = current;
01419   lt.advance(-2); // <!
01420 
01421   // When we get here, we have always already consumed <!
01422   // Skip over possible leading minuses
01423   if (current != end && *current == kMinus) {
01424     beginLastMinus = current;
01425     ++current;
01426     ++beginData;
01427     if (current != end && *current == kMinus) { // <!--
01428       beginLastMinus = current;
01429       ++current;
01430       ++beginData;
01431       // Long form comment
01432 
01433       nsScannerIterator currentEnd = end, gt = end;
01434       
01435       // Find the end of the comment
01436       while (FindCharInReadable(kGreaterThan, current, currentEnd)) {
01437         gt = current;
01438         if (bestAltCommentEnd == end) {
01439           bestAltCommentEnd = gt; // remember only the first '>' as the fallback terminator
01440         }
01441         --current;
01442         PRBool goodComment = PR_FALSE;
01443         if (current != beginLastMinus && *current == kMinus) { // ->
01444           --current;
01445           if (current != beginLastMinus && *current == kMinus) { // -->
01446             goodComment = PR_TRUE;
01447             --current;
01448           }
01449         } else if (current != beginLastMinus && *current == '!') {
01450           --current;
01451           if (current != beginLastMinus && *current == kMinus) {
01452             --current;
01453             if (current != beginLastMinus && *current == kMinus) { // --!>
01454               --current;
01455               goodComment = PR_TRUE;
01456             }
01457           }
01458         } else if (current == beginLastMinus) { // the '>' directly follows the opening "<!--"
01459           goodComment = PR_TRUE;
01460         }
01461     
01462         if (goodComment) {
01463           // done
01464           aScanner.BindSubstring(mComment, beginData, ++current);
01465           aScanner.BindSubstring(mCommentDecl, lt, ++gt);
01466           aScanner.SetPosition(gt);
01467           return NS_OK;
01468         } else {
01469           // try again starting after the last '>'
01470           current = ++gt;
01471           currentEnd = end;
01472         }
01473       } //while
01474   
01475       if (aScanner.IsIncremental()) {
01476         // We got here because we saw the beginning of a comment,
01477         // but not yet the end, and we are still loading the page. In that
01478         // case the return value here will cause us to unwind,
01479         // wait for more content, and try again.
01480         // XXX For performance reasons we should cache where we were, and
01481         //     continue from there for next call
01482         return kEOF;
01483       }
01484 
01485       // If you're here, then we're in a special state. 
01486       // The problem at hand is that we've hit the end of the document without finding the normal endcomment delimiter "-->".
01487       // In this case, the first thing we try is to see if we found an alternate endcomment delimiter ">".
01488       // If so, rewind just past that, and use everything up to that point as your comment.
01489       // If not, the document has no end comment and should be treated as one big comment.
01490       gt = bestAltCommentEnd;
01491       aScanner.BindSubstring(mComment, beginData, gt);
01492       if (gt != end) {
01493         ++gt;
01494       }
01495       aScanner.BindSubstring(mCommentDecl, lt, gt);
01496       aScanner.SetPosition(gt);
01497       return NS_OK;
01498     }
01499   }
01500   
01501   // This could be short form of comment
01502   // Find the end of the comment
01503   current = beginData;
01504   if (FindCharInReadable(kGreaterThan, current, end)) {
01505     nsScannerIterator gt = current;
01506     if (current != beginData) {
01507       --current;
01508       if (current != beginData && *current == kMinus) { // ->
01509         --current;
01510         if (current != beginData && *current == kMinus) { // -->
01511           --current;
01512         }
01513       } else if (current != beginData && *current == '!') { // !>
01514         --current;
01515         if (current != beginData && *current == kMinus) { // -!>
01516           --current;
01517           if (current != beginData && *current == kMinus) { // --!>
01518             --current;
01519           }
01520         }
01521       }
01522     }
01523 
01524     if (current != gt) {
01525       aScanner.BindSubstring(mComment, beginData, ++current);
01526     }
01527     else {
01528       // Bind mComment to an empty string (note that if current == gt,
01529       // then current == beginData). We reach this for <!>
01530       aScanner.BindSubstring(mComment, beginData, current);
01531     }
01532     aScanner.BindSubstring(mCommentDecl, lt, ++gt);
01533     aScanner.SetPosition(gt);
01534     return NS_OK;
01535   }
01536 
01537   if (!aScanner.IsIncremental()) {
01538     // This isn't a comment at all, go back to the < and consume as text.
01539     aScanner.SetPosition(lt, PR_FALSE, PR_TRUE);
01540     return kNotAComment;
01541   }
01542 
01543   // Wait for more data...
01544   return kEOF;
01545 }
01546 
01547 /*
01548  *  Consume a comment, using the strict or the quirks algorithm as aFlag dictates.
01549  *  Note that we've already eaten the "<!" portion.
01550  *  @update  gess 16June2000
01551  *  @param   aChar -- last char consumed from stream (not referenced in this method)
01552  *  @param   aScanner -- controller of underlying input source
01553  *  @param   aFlag -- parser flags: NS_IPARSER_FLAG_STRICT_MODE picks the algorithm, NS_IPARSER_FLAG_VIEW_SOURCE affects mNewlineCount
01554  *  @return  error result of ConsumeStrictComment / ConsumeQuirksComment, unchanged
01555  */
01556 nsresult CCommentToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
01557   nsresult result=NS_OK; // always overwritten below; start from a real nsresult, not a PRBool constant
01558   
01559   if (aFlag & NS_IPARSER_FLAG_STRICT_MODE) {
01560     //Enabling strict comment parsing for Bug 53011 and  2749 contradicts!!!!
01561     result = ConsumeStrictComment(aScanner);
01562   }
01563   else {
01564     result = ConsumeQuirksComment(aScanner);
01565   }
01566 
01567   if (NS_SUCCEEDED(result)) {
01568     mNewlineCount = !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) ? mCommentDecl.CountChar(kNewLine) : -1; // view source gets -1 instead of a count
01569   }
01570 
01571   return result;
01572 }
01573 
01574 const nsSubstring& CCommentToken::GetStringValue(void)
01575 {
01576   return mComment.AsString(); // mComment is the comment text, as opposed to mCommentDecl (the whole declaration)
01577 }
01578 
01579 /*
01580  *  Report the type of this token.
01581  *  
01582  *  @update  gess 3/25/98
01583  *  @param   (none)
01584  *  @return  eToken_comment
01585  */
01586 PRInt32 CCommentToken::GetTokenType(void) {
01587   return eToken_comment;
01588 }
01589 
01590 /*
01591  *  default constructor; the token is tagged eHTMLTag_newline
01592  *  
01593  *  @update  gess 3/25/98
01594  *  @param   (none)
01595  *  @return  
01596  */
01597 CNewlineToken::CNewlineToken() : CHTMLToken(eHTMLTag_newline) {
01598 }
01599 
01600 /*
01601  *  Report the type of this token.
01602  *  
01603  *  @update  gess 3/25/98
01604  *  @param   (none)
01605  *  @return  eToken_newline
01606  */
01607 PRInt32 CNewlineToken::GetTokenType(void) {
01608   return eToken_newline;
01609 }
01610 
01611 
01612 static nsScannerSubstring* gNewlineStr; // file-local "\n" substring handed out by CNewlineToken::GetStringValue()
01613 void CNewlineToken::AllocNewline()
01614 {
01615   gNewlineStr = new nsScannerSubstring(NS_LITERAL_STRING("\n")); // NOTE(review): any previous value is overwritten without being freed
01616 }
01617 
01618 void CNewlineToken::FreeNewline()
01619 {
01620   if (gNewlineStr) {
01621     delete gNewlineStr;
01622     gNewlineStr = nsnull; // reset so that a repeated FreeNewline() does nothing
01623   }
01624 }
01625 
01632 const nsSubstring& CNewlineToken::GetStringValue(void) {
01633   return gNewlineStr->AsString(); // NOTE(review): gNewlineStr is dereferenced unchecked -- assumes AllocNewline() ran first
01634 }
01635 
01636 /*
01637  *  Consume one line break: aChar plus, at most, one complementary cr/lf char that follows it.
01638  *  @update  gess 3/25/98
01639  *  @param   aChar -- last char consumed from stream
01640  *  @param   aScanner -- controller of underlying input source
01641  *  @param   aFlag -- parser flags (not referenced in this method)
01642  *  @return  error result; kEOF on a non-incremental scanner is reported as NS_OK
01643  */
01644 nsresult CNewlineToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
01645 
01646 /*******************************************************************
01647 
01648   Here's what the HTML spec says about newlines:
01649 
01650   "A line break is defined to be a carriage return (&#x000D;), 
01651    a line feed (&#x000A;), or a carriage return/line feed pair. 
01652    All line breaks constitute white space."
01653 
01654  *******************************************************************/
01655 
01656   PRUnichar theChar;
01657   nsresult result=aScanner.Peek(theChar);
01658 
01659   if(NS_OK==result) {
01660     switch(aChar) {
01661       case kNewLine:
01662         if(kCR==theChar) { // LF followed by CR: swallow the CR as part of this newline
01663           result=aScanner.GetChar(theChar);
01664         }
01665         break;
01666       case kCR: 
01667           //treat CRLF as a single newline: swallow the LF as well
01668         if(kNewLine==theChar) {
01669           result=aScanner.GetChar(theChar);
01670         }
01671         break;
01672       default:
01673         break;
01674     }
01675   }
01676 
01677   if (result == kEOF && !aScanner.IsIncremental()) {
01678     // Make sure we don't lose information about this trailing newline.
01679     result = NS_OK;
01680   }
01681 
01682   mNewlineCount = 1; // set unconditionally, whatever result is
01683   return result;
01684 }
01685 
01686 /*
01687  *  default constructor; the token is tagged eHTMLTag_unknown
01688  *  
01689  *  @update  gess 3/25/98
01690  *  @param   (none)
01691  *  @return  
01692  */
01693 CAttributeToken::CAttributeToken() : CHTMLToken(eHTMLTag_unknown) {
01694   mHasEqualWithoutValue=PR_FALSE;
01695 #ifdef DEBUG
01696   mLastAttribute = PR_FALSE;
01697 #endif
01698 }
01699 
01700 /*
01701  *  string based constructor; the token is tagged eHTMLTag_unknown
01702  *  
01703  *  @update  gess 3/25/98
01704  *  @param   aName -- string assigned to mTextValue (the key, mTextKey, is not set here)
01705  *  @return  
01706  */
01707 CAttributeToken::CAttributeToken(const nsAString& aName) : CHTMLToken(eHTMLTag_unknown) {
01708   mTextValue.writable().Assign(aName);
01709   mHasEqualWithoutValue=PR_FALSE;
01710 #ifdef DEBUG
01711   mLastAttribute = PR_FALSE;
01712 #endif
01713 }
01714 
01715 /*
01716  *  construct initializing data to key value pair
01717  *  
01718  *  @update  gess 3/25/98
01719  *  @param   aKey -- string that mTextKey is rebound to
01720  *  @param   aName -- string assigned to mTextValue
01721  *  @return  
01722  */
01723 CAttributeToken::CAttributeToken(const nsAString& aKey, const nsAString& aName) : CHTMLToken(eHTMLTag_unknown) {
01724   mTextValue.writable().Assign(aName);
01725   mTextKey.Rebind(aKey);
01726   mHasEqualWithoutValue=PR_FALSE;
01727 #ifdef DEBUG
01728   mLastAttribute = PR_FALSE;
01729 #endif
01730 }
01731 
01732 /*
01733  *  Report the type of this token.
01734  *  
01735  *  @update  gess 3/25/98
01736  *  @param   (none)
01737  *  @return  eToken_attribute
01738  */
01739 PRInt32 CAttributeToken::GetTokenType(void) {
01740   return eToken_attribute;
01741 }
01742 
01743 const nsSubstring& CAttributeToken::GetStringValue(void)
01744 {
01745   return mTextValue.str(); // the attribute's value text; the key lives separately in mTextKey
01746 }
01747  
01748 /*
01749  *  Replace the contents of anOutputString with this attribute's source text.
01750  *  
01751  *  @update  rickg  6June2000
01752  *  @param   anOutputString will receive the result (truncated first)
01753  *  @return  nada
01754  */
01755 void CAttributeToken::GetSource(nsString& anOutputString){
01756   anOutputString.Truncate();
01757   AppendSourceTo(anOutputString);
01758 }
01759 
01760 /*
01761  *  
01762  *  
01763  *  @update  rickg  6June2000
01764  *  @param   result appended to the output string.
01765  *  @return  nada
01766  */
01767 void CAttributeToken::AppendSourceTo(nsAString& anOutputString){
01768   AppendUnicodeTo(mTextKey, anOutputString);
01769   if(mTextValue.str().Length() || mHasEqualWithoutValue) 
01770     anOutputString.AppendLiteral("=");
01771   anOutputString.Append(mTextValue.str());
01772   // anOutputString.AppendLiteral(";");
01773 }
01774 
01775 /*
01776  *  This general purpose method is used when you want to
01777  *  consume a known quoted string. 
01778  *  
01779  *  @param   aScanner -- controller of underlying input source
01780  *  @param   aTerminalChars -- characters that stop consuming attribute.
01781  *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
01782  *  @return  error result
01783  */
01784 static
01785 nsresult ConsumeQuotedString(PRUnichar aChar,
01786                              nsScannerSharedSubstring& aString,
01787                              PRInt32& aNewlineCount,
01788                              nsScanner& aScanner,
01789                              PRInt32 aFlag)
01790 {
01791   NS_ASSERTION(aChar==kQuote || aChar==kApostrophe,"char is neither quote nor apostrophe");
01792   // hold onto this in case this is an unterminated string literal
01793   PRUint32 origLen = aString.str().Length();
01794 
01795   static const PRUnichar theTerminalCharsQuote[] = { 
01796     PRUnichar(kQuote), PRUnichar('&'), PRUnichar(kCR),
01797     PRUnichar(kNewLine), PRUnichar(0) };
01798   static const PRUnichar theTerminalCharsApostrophe[] = { 
01799     PRUnichar(kApostrophe), PRUnichar('&'), PRUnichar(kCR),
01800     PRUnichar(kNewLine), PRUnichar(0) };
01801   static const nsReadEndCondition
01802     theTerminateConditionQuote(theTerminalCharsQuote);
01803   static const nsReadEndCondition
01804     theTerminateConditionApostrophe(theTerminalCharsApostrophe);
01805 
01806   // Assume Quote to init to something
01807   const nsReadEndCondition *terminateCondition = &theTerminateConditionQuote;
01808   if (aChar==kApostrophe)
01809     terminateCondition = &theTerminateConditionApostrophe;
01810   
01811   nsresult result=NS_OK;
01812   nsScannerIterator theOffset;
01813   aScanner.CurrentPosition(theOffset);
01814 
01815   result=ConsumeUntil(aString,aNewlineCount,aScanner,
01816                       *terminateCondition,PR_TRUE,aFlag);
01817 
01818   if(NS_SUCCEEDED(result)) {
01819     result = aScanner.GetChar(aChar); // aChar should be " or '
01820   }
01821 
01822   // Ref: Bug 35806
01823   // A back up measure when disaster strikes...
01824   // Ex <table> <tr d="><td>hello</td></tr></table>
01825   if(!aString.str().IsEmpty() && aString.str().Last()!=aChar &&
01826      !aScanner.IsIncremental() && result==kEOF) {
01827     static const nsReadEndCondition
01828       theAttributeTerminator(kAttributeTerminalChars);
01829     aString.writable().Truncate(origLen);
01830     aScanner.SetPosition(theOffset, PR_FALSE, PR_TRUE);
01831     result=ConsumeUntil(aString,aNewlineCount,aScanner,
01832                         theAttributeTerminator,PR_FALSE,aFlag);
01833     if (NS_SUCCEEDED(result) && (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
01834       // Remember that this string literal was unterminated.
01835       result = NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL;
01836     }
01837   }
01838   return result;
01839 }
01840 
01841 /*
01842  * This method is meant to be used by view-source to consume invalid attributes.
01843  * For the purposes of this method, an invalid attribute is an attribute that
01844  * starts with either ', ", or /. We consume all ', ", or / and the following whitespace.
01845  * 
01846  * @param aScanner -- the scanner we're reading our data from.
01847  * @param aChar -- the character we're skipping
01848  * @param aCurrent -- the current position that we're looking at.
01849  * @param aNewlineCount -- a count of the newlines we've consumed.
01850  * @return error result.
01851  */
01852 static
01853 nsresult ConsumeInvalidAttribute(nsScanner& aScanner,
01854                                  PRUnichar aChar,
01855                                  nsScannerIterator& aCurrent,
01856                                  PRInt32& aNewlineCount) {
01857   NS_ASSERTION(aChar == kApostrophe || aChar == kQuote || aChar == kForwardSlash,
01858                "aChar must be a quote or apostrophe");
01859   nsScannerIterator end, wsbeg;
01860   aScanner.EndReading(end);
01861 
01862   while (aCurrent != end && *aCurrent == aChar) {
01863     ++aCurrent;
01864   }
01865 
01866   aScanner.SetPosition(aCurrent);
01867   return aScanner.ReadWhitespace(wsbeg, aCurrent, aNewlineCount);
01868 }
01869 
01870 /*
01871  *  Consume the key and value portions of the attribute.
01872  *  
01873  *  @param   aChar -- last char consumed from stream
01874  *  @param   aScanner -- controller of underlying input source
01875  *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
01876  *  @return  error result
01877  */
01878 nsresult CAttributeToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
01879 {
01880   nsresult result;
01881   nsScannerIterator wsstart, wsend;
01882   
01883   if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
01884     result = aScanner.ReadWhitespace(wsstart, wsend, mNewlineCount);
01885     if (kEOF == result && wsstart != wsend) {
01886       // Do this here so if this is the final token in the document, we don't
01887       // lose the whitespace.
01888       aScanner.BindSubstring(mTextKey, wsstart, wsend);
01889     }
01890   }
01891   else {
01892     result = aScanner.SkipWhitespace(mNewlineCount);
01893   }
01894 
01895   if (NS_OK==result) {
01896     static const PRUnichar theTerminalsChars[] = 
01897     { PRUnichar(' '), PRUnichar('"'), 
01898       PRUnichar('='), PRUnichar('\n'), 
01899       PRUnichar('\r'), PRUnichar('\t'), 
01900       PRUnichar('>'), PRUnichar('<'),
01901       PRUnichar('\''), PRUnichar('/'),
01902       PRUnichar(0) };
01903     static const nsReadEndCondition theEndCondition(theTerminalsChars);
01904 
01905     nsScannerIterator start, end;
01906     result=aScanner.ReadUntil(start,end,theEndCondition,PR_FALSE);
01907 
01908     if (!(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
01909       aScanner.BindSubstring(mTextKey, start, end);
01910     } 
01911     else if (kEOF == result && wsstart != end) {
01912       //Capture all of the text (from the beginning of the whitespace to the
01913       //end of the document).
01914       aScanner.BindSubstring(mTextKey, wsstart, end);
01915     }
01916 
01917     //now it's time to Consume the (optional) value...
01918     if (NS_OK==result) {
01919       if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
01920         result = aScanner.ReadWhitespace(start, wsend, mNewlineCount);
01921         aScanner.BindSubstring(mTextKey, wsstart, wsend);
01922       }
01923       else {
01924         result = aScanner.SkipWhitespace(mNewlineCount);
01925       }
01926 
01927       if (NS_OK==result) { 
01928         result=aScanner.Peek(aChar);       //Skip ahead until you find an equal sign or a '>'...
01929         if (NS_OK==result) {  
01930           if (kEqual==aChar){
01931             result=aScanner.GetChar(aChar);  //skip the equal sign...
01932             if (NS_OK==result) {
01933               if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
01934                 PRBool haveCR;
01935                 result = aScanner.ReadWhitespace(mTextValue, mNewlineCount,
01936                                                  haveCR);
01937               }
01938               else {
01939                 result = aScanner.SkipWhitespace(mNewlineCount);
01940               }
01941 
01942               if (NS_OK==result) {
01943                 result=aScanner.Peek(aChar);  //and grab the next char.    
01944                 if (NS_OK==result) {
01945                   if ((kQuote==aChar) || (kApostrophe==aChar)) {
01946                     aScanner.GetChar(aChar);
01947                     if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
01948                       mTextValue.writable().Append(aChar);
01949                     }
01950                     
01951                     result=ConsumeQuotedString(aChar,mTextValue,mNewlineCount,
01952                                                aScanner,aFlag);
01953                     if (NS_SUCCEEDED(result) && (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
01954                       mTextValue.writable().Append(aChar);
01955                     } else if (result == NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL) {
01956                       result = NS_OK;
01957                       mInError = PR_TRUE;
01958                     }
01959                     // According to spec. we ( who? ) should ignore linefeeds. But look,
01960                     // even the carriage return was getting stripped ( wonder why! ) -
01961                     // Ref. to bug 15204.  Okay, so the spec. told us to ignore linefeeds,
01962                     // bug then what about bug 47535 ? Should we preserve everything then?
01963                     // Well, let's make it so! Commenting out the next two lines..
01964                     /*if(!aRetain)
01965                       mTextValue.StripChars("\r\n"); //per the HTML spec, ignore linefeeds...
01966                     */
01967                   }
01968                   else if (kGreaterThan==aChar){      
01969                     mHasEqualWithoutValue=PR_TRUE;
01970                     mInError=PR_TRUE;
01971                   }
01972                   else {
01973                     static const nsReadEndCondition
01974                       theAttributeTerminator(kAttributeTerminalChars);
01975                     result=ConsumeUntil(mTextValue,
01976                                         mNewlineCount,
01977                                         aScanner,
01978                                         theAttributeTerminator,
01979                                         PR_FALSE,
01980                                         aFlag);
01981                   } 
01982                 }//if
01983                 if (NS_OK==result) {
01984                   if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
01985                     PRBool haveCR;
01986                     result = aScanner.ReadWhitespace(mTextValue, mNewlineCount,
01987                                                      haveCR);
01988                   }
01989                   else {
01990                     result = aScanner.SkipWhitespace(mNewlineCount);
01991                   }
01992                 }
01993               }//if
01994               else {
01995                 //We saw an equal sign but ran out of room looking for a value.
01996                 mHasEqualWithoutValue=PR_TRUE;
01997                 mInError=PR_TRUE;
01998               }
01999             }//if
02000           }//if
02001           else {
02002             //This is where we have to handle fairly busted content.
02003             //If you're here, it means we saw an attribute name, but couldn't find 
02004             //the following equal sign.  <tag NAME....
02005         
02006             //Doing this right in all cases is <i>REALLY</i> ugly. 
02007             //My best guess is to grab the next non-ws char. We know it's not '=',
02008             //so let's see what it is. If it's a '"', then assume we're reading
02009             //from the middle of the value. Try stripping the quote and continuing...
02010             //Note that this code also strips forward slashes to handle cases
02011             //like <tag NAME/>
02012             if (kQuote == aChar || kApostrophe == aChar ||
02013                 kForwardSlash == aChar) {
02014               // In XML, a trailing slash isn't an error.
02015               if (kForwardSlash != aChar || !(aFlag & NS_IPARSER_FLAG_XML)) {
02016                 mInError = PR_TRUE;
02017               }
02018 
02019               if (!(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
02020                 result = aScanner.SkipOver(aChar); // Strip quote or slash.
02021                 if (NS_SUCCEEDED(result)) {
02022                   result = aScanner.SkipWhitespace(mNewlineCount);
02023                 }
02024               } else {
02025                 //We want to collect whitespace here so that following 
02026                 //attributes can have the right line number (and for
02027                 //parity with the non-view-source code above).
02028                 result = ConsumeInvalidAttribute(aScanner, aChar, wsend, mNewlineCount);
02029 
02030                 aScanner.BindSubstring(mTextKey, wsstart, wsend);
02031                 aScanner.SetPosition(wsend);
02032               } 
02033             }
02034           }
02035         }//if
02036       } //if
02037     }//if (consume optional value)
02038 
02039     if (NS_OK==result) {
02040       if (mTextValue.str().Length() == 0 && mTextKey.Length() == 0 && 
02041           mNewlineCount == 0) {
02042         //This attribute contains no useful information for us, so there is no
02043         //use in keeping it around. Attributes that are otherwise empty, but
02044         //have newlines in them are passed on the the DTD so it can get line
02045         //numbering right.
02046         return NS_ERROR_HTMLPARSER_BADATTRIBUTE;
02047       }
02048 
02049 #ifdef DEBUG
02050       result = aScanner.Peek(aChar);
02051       mLastAttribute = (kGreaterThan == aChar || kEOF == result);
02052 #endif
02053     }
02054   }//if
02055 
02056   if (kEOF == result && !aScanner.IsIncremental()) {
02057     // This is our run-of-the mill "don't lose content at the end of a 
02058     // document" with a slight twist: we don't want to bother returning an
02059     // empty attribute key, even if this is the end of the document.
02060     if (mTextKey.Length() == 0) {
02061       result = NS_ERROR_HTMLPARSER_BADATTRIBUTE;
02062     }
02063     else {
02064       result = NS_OK;
02065     }
02066   }
02067 
02068   return result;
02069 }
02070 
02071 void CAttributeToken::SetKey(const nsAString& aKey)
02072 {
02073   mTextKey.Rebind(aKey);
02074 }
02075 
02076 void CAttributeToken::BindKey(nsScanner* aScanner, 
02077                               nsScannerIterator& aStart, 
02078                               nsScannerIterator& aEnd)
02079 {
02080   aScanner->BindSubstring(mTextKey, aStart, aEnd);
02081 }
02082 
02083 /*
02084  *  default constructor
02085  *  
02086  *  @update  gess 3/25/98
02087  *  @param   aName -- string to init token name with
02088  *  @return  
02089  */
02090 CWhitespaceToken::CWhitespaceToken() : CHTMLToken(eHTMLTag_whitespace) {
02091 }
02092 
02093 
02094 /*
02095  *  default constructor
02096  *  
02097  *  @update  gess 3/25/98
02098  *  @param   aName -- string value to init token name with
02099  *  @return  
02100  */
02101 CWhitespaceToken::CWhitespaceToken(const nsAString& aName) : CHTMLToken(eHTMLTag_whitespace) {
02102   mTextValue.writable().Assign(aName);
02103 }
02104 
02105 /*
02106  *  
02107  *  
02108  *  @update  gess 3/25/98
02109  *  @param   
02110  *  @return  
02111  */
02112 PRInt32 CWhitespaceToken::GetTokenType(void) {
02113   return eToken_whitespace;
02114 }
02115 
02116 /*
02117  *  This general purpose method is used when you want to
02118  *  consume an aribrary sequence of whitespace. 
02119  *  
02120  *  @update  gess 3/25/98
02121  *  @param   aChar -- last char consumed from stream
02122  *  @param   aScanner -- controller of underlying input source
02123  *  @return  error result
02124  */
02125 nsresult CWhitespaceToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
02126   // If possible, we'd like to just be a dependent substring starting at
02127   // |aChar|.  The scanner has already been advanced, so we need to
02128   // back it up to facilitate this.
02129 
02130   nsScannerIterator start;
02131   aScanner.CurrentPosition(start);
02132   aScanner.SetPosition(--start, PR_FALSE, PR_TRUE);
02133 
02134   PRBool haveCR;
02135 
02136   nsresult result = aScanner.ReadWhitespace(mTextValue, mNewlineCount, haveCR);
02137   
02138   if (result == kEOF && !aScanner.IsIncremental()) {
02139     // Oops, we ran off the end, make sure we don't lose the trailing
02140     // whitespace!
02141     result = NS_OK;
02142   }
02143 
02144   if (NS_OK == result && haveCR) {
02145     mTextValue.writable().StripChar(kCR);
02146   }
02147   return result;
02148 }
02149 
02150 const nsSubstring& CWhitespaceToken::GetStringValue(void)
02151 {
02152   return mTextValue.str();
02153 }
02154 
02155 /*
02156  *  default constructor
02157  *  
02158  *  @update  gess 3/25/98
02159  *  @param   aName -- string to init token name with
02160  *  @return  
02161  */
02162 CEntityToken::CEntityToken() : CHTMLToken(eHTMLTag_entity) {
02163 }
02164 
02165 /*
02166  *  default constructor
02167  *  
02168  *  @update  gess 3/25/98
02169  *  @param   aName -- string value to init token name with
02170  *  @return  
02171  */
02172 CEntityToken::CEntityToken(const nsAString& aName) : CHTMLToken(eHTMLTag_entity) {
02173   mTextValue.Assign(aName);
02174 #ifdef VERBOSE_DEBUG
02175   if(!VerifyEntityTable())  {
02176     cout<<"Entity table is invalid!" << endl;
02177   }
02178 #endif
02179 }
02180 
02181 
02182 /*
02183  *  Consume the rest of the entity. We've already eaten the "&".
02184  *  
02185  *  @update  gess 3/25/98
02186  *  @param   aChar -- last char consumed from stream
02187  *  @param   aScanner -- controller of underlying input source
02188  *  @return  error result
02189  */
02190 nsresult CEntityToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
02191   nsresult result=ConsumeEntity(aChar,mTextValue,aScanner);
02192   return result;
02193 }
02194 
02195 /*
02196  *  
02197  *  
02198  *  @update  gess 3/25/98
02199  *  @param   
02200  *  @return  
02201  */
02202 PRInt32 CEntityToken::GetTokenType(void) {
02203   return eToken_entity;
02204 }
02205 
02206 /*
02207  *  This general purpose method is used when you want to
02208  *  consume an entity &xxxx;. Keep in mind that entities
02209  *  are <i>not</i> reduced inline.
02210  *  
02211  *  @update  gess 3/25/98
02212  *  @param   aChar -- last char consumed from stream
02213  *  @param   aScanner -- controller of underlying input source
02214  *  @return  error result
02215  */
02216 nsresult
02217 CEntityToken::ConsumeEntity(PRUnichar aChar,
02218                             nsString& aString,
02219                             nsScanner& aScanner) {
02220   nsresult result=NS_OK;
02221   if(kLeftBrace==aChar) {
02222     //you're consuming a script entity...
02223     aScanner.GetChar(aChar); // Consume &
02224 
02225     PRInt32 rightBraceCount = 0;
02226     PRInt32 leftBraceCount  = 0;
02227 
02228     do {
02229       result=aScanner.GetChar(aChar);
02230       
02231       if (NS_FAILED(result)) {
02232         return result;
02233       }
02234 
02235       aString.Append(aChar);
02236       if(aChar==kRightBrace)
02237         ++rightBraceCount;
02238       else if(aChar==kLeftBrace)
02239         ++leftBraceCount;
02240     } while(leftBraceCount!=rightBraceCount);
02241   } //if
02242   else {
02243     PRUnichar theChar=0;
02244     if (kHashsign==aChar) {
02245       result = aScanner.Peek(theChar,2);
02246        
02247       if (NS_FAILED(result)) {
02248         if (kEOF == result && !aScanner.IsIncremental()) {
02249           // If this is the last buffer then we are certainly
02250           // not dealing with an entity. That's, there are
02251           // no more characters after &#. Bug 188278.
02252           return NS_HTMLTOKENS_NOT_AN_ENTITY;
02253         }
02254         return result;
02255       }
02256 
02257       if (nsCRT::IsAsciiDigit(theChar)) {
02258         aScanner.GetChar(aChar); // Consume &
02259         aScanner.GetChar(aChar); // Consume #
02260         aString.Assign(aChar);
02261         result=aScanner.ReadNumber(aString,10);
02262       }
02263       else if (theChar == 'x' || theChar == 'X') {
02264         aScanner.GetChar(aChar);   // Consume &
02265         aScanner.GetChar(aChar);   // Consume #
02266         aScanner.GetChar(theChar); // Consume x
02267         aString.Assign(aChar);
02268         aString.Append(theChar); 
02269         result=aScanner.ReadNumber(aString,16);
02270       }
02271       else {
02272         return NS_HTMLTOKENS_NOT_AN_ENTITY; 
02273       }
02274     }
02275     else {
02276       result = aScanner.Peek(theChar,1);
02277        
02278       if (NS_FAILED(result)) {
02279         return result;
02280       }
02281 
02282       if(nsCRT::IsAsciiAlpha(theChar) || 
02283         theChar == '_' ||
02284         theChar == ':') {
02285         aScanner.GetChar(aChar); // Consume &
02286         result=aScanner.ReadEntityIdentifier(aString);
02287       }
02288       else {
02289         return NS_HTMLTOKENS_NOT_AN_ENTITY;
02290       }
02291     }
02292   }
02293     
02294   if (NS_FAILED(result)) {
02295     return result;
02296   }
02297     
02298   result=aScanner.Peek(aChar);
02299   
02300   if (NS_FAILED(result)) {
02301     return result;
02302   }
02303 
02304   if (aChar == kSemicolon) {
02305     // consume semicolon that stopped the scan
02306     aString.Append(aChar);
02307     result=aScanner.GetChar(aChar);
02308   }
02309   
02310   return result;
02311 }
02312 
02313 #define PA_REMAP_128_TO_160_ILLEGAL_NCR 1
02314 
02315 #ifdef PA_REMAP_128_TO_160_ILLEGAL_NCR
02316 
02320 #define NOT_USED 0xfffd
02321 
02322 static const PRUint16 PA_HackTable[] = {
02323        0x20ac,  /* EURO SIGN */
02324        NOT_USED,
02325        0x201a,  /* SINGLE LOW-9 QUOTATION MARK */
02326        0x0192,  /* LATIN SMALL LETTER F WITH HOOK */
02327        0x201e,  /* DOUBLE LOW-9 QUOTATION MARK */
02328        0x2026,  /* HORIZONTAL ELLIPSIS */
02329        0x2020,  /* DAGGER */
02330        0x2021,  /* DOUBLE DAGGER */
02331        0x02c6,  /* MODIFIER LETTER CIRCUMFLEX ACCENT */
02332        0x2030,  /* PER MILLE SIGN */
02333        0x0160,  /* LATIN CAPITAL LETTER S WITH CARON */
02334        0x2039,  /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
02335        0x0152,  /* LATIN CAPITAL LIGATURE OE */
02336        NOT_USED,
02337        0x017D,  /* LATIN CAPITAL LETTER Z WITH CARON */
02338        NOT_USED,
02339        NOT_USED,
02340        0x2018,  /* LEFT SINGLE QUOTATION MARK */
02341        0x2019,  /* RIGHT SINGLE QUOTATION MARK */
02342        0x201c,  /* LEFT DOUBLE QUOTATION MARK */
02343        0x201d,  /* RIGHT DOUBLE QUOTATION MARK */
02344        0x2022,  /* BULLET */
02345        0x2013,  /* EN DASH */
02346        0x2014,  /* EM DASH */
02347        0x02dc,  /* SMALL TILDE */
02348        0x2122,  /* TRADE MARK SIGN */
02349        0x0161,  /* LATIN SMALL LETTER S WITH CARON */
02350        0x203a,  /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
02351        0x0153,  /* LATIN SMALL LIGATURE OE */
02352        NOT_USED,
02353        0x017E,  /* LATIN SMALL LETTER Z WITH CARON */
02354        0x0178   /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
02355 };
02356 #endif /* PA_REMAP_128_TO_160_ILLEGAL_NCR */
02357 
02358 static void AppendNCR(nsSubstring& aString, PRInt32 aNCRValue)
02359 {
02360 #ifdef PA_REMAP_128_TO_160_ILLEGAL_NCR
02361   /* for some illegal, but popular usage */
02362   if ((aNCRValue >= 0x0080) && (aNCRValue <= 0x009f)) {
02363     aNCRValue = PA_HackTable[aNCRValue - 0x0080];
02364   }
02365 #endif
02366 
02367   AppendUCS4ToUTF16(ENSURE_VALID_CHAR(aNCRValue), aString);
02368 }
02369 
02370 /*
02371  *  This method converts this entity into its underlying
02372  *  unicode equivalent.
02373  *  
02374  *  @update  gess 3/25/98
02375  *  @param   aString will hold the resulting string value
02376  *  @return  numeric (unichar) value
02377  */
02378 PRInt32 CEntityToken::TranslateToUnicodeStr(nsString& aString) {
02379   PRInt32 value=0;
02380 
02381   if(mTextValue.Length()>1) {
02382     PRUnichar theChar0=mTextValue.CharAt(0);
02383 
02384     if(kHashsign==theChar0) {
02385       PRInt32 err=0;
02386       
02387       value=mTextValue.ToInteger(&err,kAutoDetect);
02388 
02389       if(0==err) {
02390         AppendNCR(aString, value);
02391       }
02392     }
02393     else{
02394       value = nsHTMLEntities::EntityToUnicode(mTextValue);
02395       if(-1<value) {
02396         //we found a named entity...
02397         aString.Assign(PRUnichar(value));
02398       }
02399     }//else
02400   }//if
02401 
02402   return value;
02403 }
02404 
02405 
02406 const nsSubstring& CEntityToken::GetStringValue(void)
02407 {
02408   return mTextValue;
02409 }
02410 
02411 /*
02412  *  
02413  *  
02414  *  @update  gess 3/25/98
02415  *  @param   anOutputString will recieve the result
02416  *  @return  nada
02417  */
02418 void CEntityToken::GetSource(nsString& anOutputString){
02419   anOutputString.AppendLiteral("&");
02420   anOutputString+=mTextValue;
02421   //anOutputString+=";";
02422 }
02423 
02424 /*
02425  *  
02426  *  
02427  *  @update  harishd 03/23/00
02428  *  @param   result appended to the output string.
02429  *  @return  nada
02430  */
02431 void CEntityToken::AppendSourceTo(nsAString& anOutputString){
02432   anOutputString.AppendLiteral("&");
02433   anOutputString+=mTextValue;
02434   //anOutputString+=";";
02435 }
02436 
02443 const PRUnichar* GetTagName(PRInt32 aTag)
02444 {
02445   const PRUnichar *result = nsHTMLTags::GetStringValue((nsHTMLTag) aTag);
02446 
02447   if (result) {
02448     return result;
02449   }
02450 
02451   if(aTag >= eHTMLTag_userdefined)
02452     return sUserdefined;
02453 
02454   return 0;
02455 }
02456 
02457 
02465 CInstructionToken::CInstructionToken() : CHTMLToken(eHTMLTag_instruction) {
02466 }
02467 
02475 CInstructionToken::CInstructionToken(const nsAString& aString) : CHTMLToken(eHTMLTag_unknown) {
02476   mTextValue.Assign(aString);
02477 }
02478 
02486 nsresult CInstructionToken::Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aFlag){
02487   mTextValue.AssignLiteral("<?");
02488   nsresult result=NS_OK;
02489   PRBool done=PR_FALSE;
02490   
02491   while (NS_OK==result && !done) {
02492     //Note, this call does *not* consume the >.
02493     result=aScanner.ReadUntil(mTextValue,kGreaterThan,PR_FALSE);
02494     if (NS_SUCCEEDED(result)) {
02495       //In HTML, PIs end with a '>', in XML, they end with a '?>'. Cover both
02496       //cases here.
02497       if (!(aFlag & NS_IPARSER_FLAG_XML) || kQuestionMark==mTextValue.Last()) {
02498         //This really is the end of the PI.
02499         done=PR_TRUE;
02500       }
02501       //Need to append this character no matter what.
02502       aScanner.GetChar(aChar);
02503       mTextValue.Append(aChar);
02504     }
02505   }
02506 
02507   if (kEOF==result && !aScanner.IsIncremental()) {
02508     //Hide the EOF result because there is no more text coming.
02509     mInError=PR_TRUE;
02510     result=NS_OK;
02511   }
02512 
02513   return result;
02514 }
02515 
02523 PRInt32 CInstructionToken::GetTokenType(void){
02524   return eToken_instruction;
02525 }
02526 
02527 const nsSubstring& CInstructionToken::GetStringValue(void)
02528 {
02529   return mTextValue;
02530 }
02531 
02532 // Doctype decl token
02533 
02534 CDoctypeDeclToken::CDoctypeDeclToken(eHTMLTags aTag)
02535   : CHTMLToken(aTag) {
02536 }
02537 
02538 CDoctypeDeclToken::CDoctypeDeclToken(const nsAString& aString,eHTMLTags aTag)
02539   : CHTMLToken(aTag), mTextValue(aString) {
02540 }
02541 
02550 nsresult CDoctypeDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner,PRInt32 aFlag) {
02551     
02552   static const PRUnichar terminalChars[] = 
02553   { PRUnichar('>'), PRUnichar('<'),
02554     PRUnichar(0) 
02555   };
02556   static const nsReadEndCondition theEndCondition(terminalChars);
02557 
02558   nsScannerIterator start, end;
02559   
02560   aScanner.CurrentPosition(start);
02561   aScanner.EndReading(end);
02562 
02563   nsresult result=aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
02564 
02565   if (NS_SUCCEEDED(result)) {
02566     PRUnichar ch;
02567     aScanner.Peek(ch);
02568     if (ch == kGreaterThan) {
02569       // Include '>' but not '<' since '<' 
02570       // could belong to another tag.
02571       aScanner.GetChar(ch);
02572       end.advance(1); 
02573     } else {
02574       NS_ASSERTION(kLessThan == ch, 
02575                    "Make sure this doctype decl. is really in error.");
02576       mInError = PR_TRUE;
02577     }
02578   }
02579   else if (!aScanner.IsIncremental()) {
02580     // We have reached the document end but haven't
02581     // found either a '<' or a '>'. Therefore use
02582     // whatever we have.
02583     mInError = PR_TRUE;
02584     result = NS_OK; 
02585   }
02586   
02587   if (NS_SUCCEEDED(result)) {
02588     start.advance(-2); // Make sure to consume <!
02589     CopyUnicodeTo(start,end,mTextValue);
02590   }
02591   
02592   return result;
02593 }
02594 
02595 PRInt32 CDoctypeDeclToken::GetTokenType(void) {
02596   return eToken_doctypeDecl;
02597 }
02598 
02599 const nsSubstring& CDoctypeDeclToken::GetStringValue(void)
02600 {
02601   return mTextValue;
02602 }
02603 
02604 void CDoctypeDeclToken::SetStringValue(const nsAString& aStr)
02605 {
02606   mTextValue.Assign(aStr);
02607 }