Back to index

lightning-sunbird  0.9+nobinonly
nsParser.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* vim: set sw=2 ts=2 et tw=78: */
00003 /* ***** BEGIN LICENSE BLOCK *****
00004  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00005  *
00006  * The contents of this file are subject to the Mozilla Public License Version
00007  * 1.1 (the "License"); you may not use this file except in compliance with
00008  * the License. You may obtain a copy of the License at
00009  * http://www.mozilla.org/MPL/
00010  *
00011  * Software distributed under the License is distributed on an "AS IS" basis,
00012  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00013  * for the specific language governing rights and limitations under the
00014  * License.
00015  *
00016  * The Original Code is mozilla.org code.
00017  *
00018  * The Initial Developer of the Original Code is
00019  * Netscape Communications Corporation.
00020  * Portions created by the Initial Developer are Copyright (C) 1998
00021  * the Initial Developer. All Rights Reserved.
00022  *
00023  * Contributor(s):
00024  *   Pierre Phaneuf <pp@ludusdesign.com>
00025  *
00026  * Alternatively, the contents of this file may be used under the terms of
00027  * either of the GNU General Public License Version 2 or later (the "GPL"),
00028  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00029  * in which case the provisions of the GPL or the LGPL are applicable instead
00030  * of those above. If you wish to allow use of your version of this file only
00031  * under the terms of either the GPL or the LGPL, and not to allow others to
00032  * use your version of this file under the terms of the MPL, indicate your
00033  * decision by deleting the provisions above and replace them with the notice
00034  * and other provisions required by the GPL or the LGPL. If you do not delete
00035  * the provisions above, a recipient may use your version of this file under
00036  * the terms of any one of the MPL, the GPL or the LGPL.
00037  *
00038  * ***** END LICENSE BLOCK ***** */
00039   
00040 #include "nsIAtom.h"
00041 #include "nsParser.h"
00042 #include "nsString.h"
00043 #include "nsCRT.h" 
00044 #include "nsScanner.h"
00045 #include "plstr.h"
00046 #include "nsIStringStream.h"
00047 #include "nsIChannel.h"
00048 #include "nsICachingChannel.h"
00049 #include "nsICacheEntryDescriptor.h"
00050 #include "nsICharsetAlias.h"
00051 #include "nsIInputStream.h"
00052 #include "CNavDTD.h"
00053 #include "COtherDTD.h"
00054 #include "prenv.h" 
00055 #include "nsParserCIID.h"
00056 #include "nsReadableUtils.h"
00057 #include "nsCOMPtr.h"
00058 #include "nsIEventQueue.h"
00059 #include "nsIEventQueueService.h"
00060 #include "nsExpatDriver.h"
00061 #include "nsIServiceManager.h"
00062 #include "nsICategoryManager.h"
00063 #include "nsISupportsPrimitives.h"
00064 #include "nsIFragmentContentSink.h"
00065 #include "nsStreamUtils.h"
00066 
00067 #ifdef MOZ_VIEW_SOURCE
00068 #include "nsViewSourceHTML.h" 
00069 #endif
00070 
// Bit flags that are OR-ed together into nsParser::mFlags.
#define NS_PARSER_FLAG_DTD_VERIFICATION       (1 << 0)
#define NS_PARSER_FLAG_PARSER_ENABLED         (1 << 1)
#define NS_PARSER_FLAG_OBSERVERS_ENABLED      (1 << 2)
// Set while a nsParserContinueEvent is sitting in the event queue.
#define NS_PARSER_FLAG_PENDING_CONTINUE_EVENT (1 << 3)
#define NS_PARSER_FLAG_CAN_INTERRUPT          (1 << 4)
#define NS_PARSER_FLAG_FLUSH_TOKENS           (1 << 5)
#define NS_PARSER_FLAG_CAN_TOKENIZE           (1 << 6)
00078 
// Identifiers that nsParser::QueryInterface() compares the requested IID
// against.
00079 static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);                 
00080 static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID); 
00081 static NS_DEFINE_IID(kIParserIID, NS_IPARSER_IID);
00082 
// Class ID passed to do_GetService() by the nsParser constructor in order to
// reach the event queue service.
00083 static NS_DEFINE_CID(kEventQueueServiceCID, NS_EVENTQUEUESERVICE_CID);
00084 
00085 //-------------------------------------------------------------------
00086 
// Definition of the static listener array. It is allocated on demand by
// nsParser::Init() and deleted by nsParser::Shutdown().
00087 nsCOMArray<nsIUnicharStreamListener> *nsParser::sParserDataListeners;
00088  
00089 
00090 class CDTDDeallocator: public nsDequeFunctor{
00091 public:
00092   virtual void* operator()(void* anObject) {
00093     nsIDTD* aDTD =(nsIDTD*)anObject;
00094     NS_RELEASE(aDTD);
00095     return 0;
00096   }
00097 };
00098 
00099 //-------------------------------------------------------------------
00100 
00101 class CDTDFinder: public nsDequeFunctor{
00102 public:
00103   CDTDFinder(nsIDTD* aDTD) {
00104     mTargetDTD=aDTD;
00105   }
00106   virtual ~CDTDFinder() {
00107   }
00108   virtual void* operator()(void* anObject) {
00109     nsIDTD* theDTD=(nsIDTD*)anObject;
00110     if(theDTD->GetMostDerivedIID().Equals(mTargetDTD->GetMostDerivedIID()))
00111       return anObject;
00112     return 0;
00113   }
00114   nsIDTD* mTargetDTD;
00115 };
00116 
00117 //-------------------------------------------------------------------
00118 
00119 class CSharedParserObjects {
00120 public:
00121 
00122   CSharedParserObjects()
00123   :mDTDDeque(0), 
00124    mHasViewSourceDTD(PR_FALSE),
00125    mHasXMLDTD(PR_FALSE) 
00126   {
00127     // do nothing.
00128   }
00129 
00130   ~CSharedParserObjects() {
00131     CDTDDeallocator theDeallocator;
00132     mDTDDeque.ForEach(theDeallocator);  //release all the DTD's
00133   }
00134 
00135   nsresult Init() {
00136     //Note: To cut down on startup time/overhead, we defer the construction of non-html DTD's. 
00137     nsIDTD* theDTD = 0;
00138     nsresult rv = NS_NewNavHTMLDTD(&theDTD);    //do this as a default HTML DTD...
00139     
00140     NS_ASSERTION(theDTD, "Failed to create DTD");
00141     NS_ENSURE_SUCCESS(rv, rv);
00142     
00143     mDTDDeque.Push(theDTD);
00144     mHasViewSourceDTD = PR_FALSE;
00145     mHasXMLDTD = PR_FALSE;
00146     return NS_OK;
00147   }
00148 
00149   nsresult RegisterDTD(nsIDTD* aDTD) {
00150     NS_ENSURE_ARG_POINTER(aDTD);
00151     nsCOMPtr<nsIDTD> dtd(aDTD);
00152     CDTDFinder theFinder(dtd);
00153     if (!mDTDDeque.FirstThat(theFinder)) {
00154       nsIDTD* theDTD;
00155       nsresult rv = dtd->CreateNewInstance(&theDTD);
00156       NS_ENSURE_SUCCESS(rv, rv);
00157       mDTDDeque.Push(theDTD);
00158     }
00159     return NS_OK;
00160   }
00161   
00162   nsDeque mDTDDeque;
00163   PRBool  mHasViewSourceDTD;  //this allows us to defer construction of this object.
00164   PRBool  mHasXMLDTD;         //also defer XML dtd construction
00165 };
00166 
00167 
00168 //-------------- Begin ParseContinue Event Definition ------------------------
00169 /*
00170 The parser can be explicitly interrupted by passing a return value of NS_ERROR_HTMLPARSER_INTERRUPTED
00171 from BuildModel on the DTD. This will cause the parser to stop processing and allow 
00172 the application to return to the event loop. The data which was left at the time of 
00173 interruption will be processed the next time OnDataAvailable is called. If the parser
00174 has received its final chunk of data then OnDataAvailable will no longer be called by the 
00175 networking module, so the parser will schedule a nsParserContinueEvent which will call 
00176 the parser to process the  remaining data after returning to the event loop. If the parser 
00177 is interrupted while processing the remaining data it will schedule another 
00178 ParseContinueEvent. The processing of data followed by scheduling of the continue events 
00179 will proceed until either:
00180 
00181   1) All of the remaining data can be processed without interrupting
00182   2) The parser has been cancelled.
00183 
00184 
00185 This capability is currently used in CNavDTD and nsHTMLContentSink. The nsHTMLContentSink is
00186 notified by CNavDTD when a chunk of tokens is going to be processed and when each token 
00187 is processed. The nsHTML content sink records the time when the chunk has started
00188 processing and will return NS_ERROR_HTMLPARSER_INTERRUPTED if the token processing time 
00189 has exceeded a threshold called max tokenizing processing time. This allows the content 
00190 sink to limit how much data is processed in a single chunk which in turn gates how much 
00191 time is spent away from the event loop. Processing smaller chunks of data also reduces 
00192 the time spent in subsequent reflows.
00193 
00194 This capability is most apparent when loading large documents. If the maximum token 
00195 processing time is set small enough the application will remain responsive during 
00196 document load. 
00197 
00198 A side-effect of this capability is that document load is not complete when the last chunk
00199 of data is passed to OnDataAvailable since  the parser may have been interrupted when 
00200 the last chunk of data arrived. The document is complete when all of the document has 
00201 been tokenized and there aren't any pending nsParserContinueEvents. This can cause 
00202 problems if the application assumes that it can monitor the load requests to determine
00203 when the document load has been completed. This is what happens in Mozilla. The document
00204 is considered completely loaded when all of the load requests have been satisfied. To delay the
00205 document load until all of the parsing has been completed the nsHTMLContentSink adds a 
00206 dummy parser load request which is not removed until the nsHTMLContentSink's DidBuildModel
00207 is called. The CNavDTD will not call DidBuildModel until the final chunk of data has been 
00208 passed to the parser through the OnDataAvailable and there aren't any pending 
00209 nsParserContinueEvents.
00210 
00211 Currently the parser ignores requests to be interrupted during the processing of script. 
00212 This is because a document.write followed by JavaScript calls to manipulate the DOM may 
00213 fail if the parser was interrupted during the document.write. 
00214 
00215 
00216 For more details @see bugzilla bug 76722
00217 */
00218 
00219 
00220 struct nsParserContinueEvent : public PLEvent {
00221 
00222   nsParserContinueEvent(nsParser* aParser)
00223   {
00224     NS_ADDREF(aParser); 
00225     PL_InitEvent(this, aParser, HandleEvent, DestroyEvent);  
00226   }
00227 
00228   ~nsParserContinueEvent()
00229   {
00230     nsParser *parser = (nsParser*) owner;
00231     NS_RELEASE(parser);
00232   }
00233 
00234   PR_STATIC_CALLBACK(void*) HandleEvent(PLEvent* aEvent)
00235   {
00236     nsParser *parser = (nsParser*) aEvent->owner;
00237     parser->HandleParserContinueEvent();
00238     return nsnull;
00239   }
00240 
00241   PR_STATIC_CALLBACK(void) DestroyEvent(PLEvent* aEvent)
00242   {
00243     delete (nsParserContinueEvent*) aEvent;
00244   }
00245 };
00246 
00247 //-------------- End ParseContinue Event Definition ------------------------
00248 
00249 
00250 static CSharedParserObjects* gSharedParserObjects=0;
00251 
00252 
00253 //-------------------------------------------------------------------------
00254 
00255 static nsresult
00256 GetSharedObjects(CSharedParserObjects** aSharedParserObjects) {
00257   if (!gSharedParserObjects) {
00258     gSharedParserObjects = new CSharedParserObjects();
00259     NS_ENSURE_TRUE(gSharedParserObjects, NS_ERROR_OUT_OF_MEMORY);
00260     nsresult rv = gSharedParserObjects->Init();
00261     NS_ENSURE_SUCCESS(rv, rv);
00262   }
00263   *aSharedParserObjects = gSharedParserObjects;
00264   return NS_OK;
00265 }
00266 
00267 static void
00268 FreeSharedObjects(void) {
00269   if (gSharedParserObjects) {
00270     delete gSharedParserObjects;
00271     gSharedParserObjects=0;
00272   }
00273 }
00274 
00275 
00279 // static
00280 nsresult nsParser::Init()
00281 {
00282   nsresult rv;
00283   nsCOMPtr<nsICategoryManager> cm =
00284     do_GetService(NS_CATEGORYMANAGER_CONTRACTID, &rv);
00285   NS_ENSURE_SUCCESS(rv, rv);
00286 
00287   nsCOMPtr<nsISimpleEnumerator> e;
00288   rv = cm->EnumerateCategory("Parser data listener", getter_AddRefs(e));
00289   NS_ENSURE_SUCCESS(rv, rv);
00290 
00291   nsCAutoString categoryEntry;
00292   nsXPIDLCString contractId;
00293   nsCOMPtr<nsISupports> entry;
00294 
00295   while (NS_SUCCEEDED(e->GetNext(getter_AddRefs(entry)))) {
00296     nsCOMPtr<nsISupportsCString> category(do_QueryInterface(entry));
00297 
00298     if (!category) {
00299       NS_WARNING("Category entry not an nsISupportsCString!");
00300 
00301       continue;
00302     }
00303 
00304     rv = category->GetData(categoryEntry);
00305     NS_ENSURE_SUCCESS(rv, rv);
00306 
00307     rv = cm->GetCategoryEntry("Parser data listener", categoryEntry.get(),
00308                               getter_Copies(contractId));
00309     NS_ENSURE_SUCCESS(rv, rv);
00310 
00311     nsCOMPtr<nsIUnicharStreamListener> listener =
00312       do_CreateInstance(contractId.get());
00313 
00314     if (listener) {
00315       if (!sParserDataListeners) {
00316         sParserDataListeners = new nsCOMArray<nsIUnicharStreamListener>();
00317 
00318         if (!sParserDataListeners)
00319           return NS_ERROR_OUT_OF_MEMORY;
00320       }
00321 
00322       sParserDataListeners->AppendObject(listener);
00323     }
00324   }
00325 
00326   return NS_OK;
00327 }
00328 
00329 
00335 // static
00336 void nsParser::Shutdown()
00337 {
00338   FreeSharedObjects();
00339 
00340   delete sParserDataListeners;
00341   sParserDataListeners = nsnull;
00342 }
00343 
00344 
00345 #ifdef DEBUG
00346 static PRBool gDumpContent=PR_FALSE;
00347 #endif
00348 
00356 nsParser::nsParser() {
00357 #ifdef NS_DEBUG
00358   if(!gDumpContent) {
00359     gDumpContent=(PR_GetEnv("PARSER_DUMP_CONTENT"))? PR_TRUE:PR_FALSE;
00360   }
00361 #endif
00362 
00363   mCharset.AssignLiteral("ISO-8859-1");
00364   mParserContext=0;
00365   mStreamStatus=0;
00366   mCharsetSource=kCharsetUninitialized;
00367   mInternalState=NS_OK;;
00368   mCommand=eViewNormal;
00369   mFlags = NS_PARSER_FLAG_OBSERVERS_ENABLED | NS_PARSER_FLAG_PARSER_ENABLED | NS_PARSER_FLAG_CAN_TOKENIZE;
00370  
00371   MOZ_TIMER_DEBUGLOG(("Reset: Parse Time: nsParser::nsParser(), this=%p\n", this));
00372   MOZ_TIMER_RESET(mParseTime);  
00373   MOZ_TIMER_RESET(mDTDTime);  
00374   MOZ_TIMER_RESET(mTokenizeTime);
00375 
00376   nsresult rv = NS_OK;
00377   if (mEventQueue == nsnull) {
00378     // Cache the event queue of the current UI thread
00379     nsCOMPtr<nsIEventQueueService> eventService = 
00380              do_GetService(kEventQueueServiceCID, &rv);
00381     if (NS_SUCCEEDED(rv) && (eventService)) {                  // XXX this implies that the UI is the current thread.
00382       rv = eventService->GetThreadEventQueue(NS_CURRENT_THREAD, getter_AddRefs(mEventQueue));
00383     }
00384 
00385    // NS_ASSERTION(mEventQueue, "event queue is null");
00386   }
00387 }
00388 
00396 nsParser::~nsParser() {
00397 
00398 #ifdef NS_DEBUG
00399   if(gDumpContent) {
00400     if(mSink) {
00401       // Sink ( HTMLContentSink at this time) supports nsIDebugDumpContent
00402       // interface. We can get to the content model through the sink.
00403       nsresult result=NS_OK;
00404       nsCOMPtr<nsIDebugDumpContent> trigger=do_QueryInterface(mSink,&result);
00405       if(NS_SUCCEEDED(result)) {
00406         trigger->DumpContentModel();
00407       }
00408     }
00409   }
00410 #endif
00411 
00412 #ifdef DEBUG
00413   if (mParserContext && mParserContext->mPrevContext) {
00414     NS_WARNING("Extra parser contexts still on the parser stack");
00415   }
00416 #endif
00417 
00418   while (mParserContext) {
00419     CParserContext *pc = mParserContext->mPrevContext;
00420     delete mParserContext;
00421     mParserContext = pc;
00422   }
00423 
00424   if (mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT) {
00425     NS_ASSERTION(mEventQueue != nsnull,"Event queue is null"); 
00426     mEventQueue->RevokeEvents(this);
00427   }
00428 }
00429 
00430 
00431 NS_IMPL_ADDREF(nsParser)
00432 NS_IMPL_RELEASE(nsParser)
00433 
00434 
00445 nsresult nsParser::QueryInterface(const nsIID& aIID, void** aInstancePtr)  
00446 {                                                                       
00447   if (NULL == aInstancePtr) {                                            
00448     return NS_ERROR_NULL_POINTER;                                        
00449   }                                                                      
00450 
00451   if(aIID.Equals(kISupportsIID))    {  //do IUnknown...
00452     *aInstancePtr = (nsIParser*)(this);                                        
00453   }
00454   else if(aIID.Equals(kIParserIID)) {  //do IParser base class...
00455     *aInstancePtr = (nsIParser*)(this);                                        
00456   }
00457   else if(aIID.Equals(NS_GET_IID(nsIRequestObserver))) {
00458     *aInstancePtr = (nsIRequestObserver*)(this);                                        
00459   }
00460   else if(aIID.Equals(NS_GET_IID(nsIStreamListener))) {
00461     *aInstancePtr = (nsIStreamListener*)(this);                                        
00462   }
00463   else if(aIID.Equals(kCParserCID)) {  //do this class...
00464     *aInstancePtr = (nsParser*)(this);                                        
00465   }   
00466   else {
00467     *aInstancePtr=0;
00468     return NS_NOINTERFACE;
00469   }
00470   NS_ADDREF_THIS();
00471   return NS_OK;                                                        
00472 }
00473 
00474 // The parser continue event is posted only if
00475 // all of the data to parse has been passed to ::OnDataAvailable
00476 // and the parser has been interrupted by the content sink
00477 // because the processing of tokens took too long.
00478  
00479 nsresult
00480 nsParser::PostContinueEvent()
00481 {
00482   if (!(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT) && mEventQueue) {
00483     nsParserContinueEvent* ev = new nsParserContinueEvent(this);
00484     NS_ENSURE_TRUE(ev, NS_ERROR_OUT_OF_MEMORY);
00485     if (NS_FAILED(mEventQueue->PostEvent(ev))) {
00486         NS_ERROR("failed to post parser continuation event");
00487         PL_DestroyEvent(ev);
00488     }
00489     else {
00490         mFlags |= NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
00491     }
00492   }
00493   return NS_OK;
00494 }
00495 
00496 
00503 NS_IMETHODIMP_(void) nsParser::SetParserFilter(nsIParserFilter * aFilter)
00504 {
00505   mParserFilter = aFilter;
00506 }
00507 
00508 
00509 NS_IMETHODIMP_(void) nsParser::GetCommand(nsCString& aCommand)
00510 {
00511   aCommand = mCommandStr;  
00512 }
00513 
00522 NS_IMETHODIMP_(void) nsParser::SetCommand(const char* aCommand)
00523 {
00524   mCommandStr.Assign(aCommand);
00525   if(mCommandStr.Equals(kViewSourceCommand))
00526     mCommand=eViewSource;
00527   else if(mCommandStr.Equals(kViewFragmentCommand))
00528     mCommand=eViewFragment;
00529   else
00530     mCommand=eViewNormal;
00531 }
00532 
00541 NS_IMETHODIMP_(void) nsParser::SetCommand(eParserCommands aParserCommand)
00542 {
00543   mCommand = aParserCommand;
00544 }
00545 
00546 
00556 NS_IMETHODIMP_(void)
00557 nsParser::SetDocumentCharset(const nsACString& aCharset, PRInt32 aCharsetSource)
00558 {
00559   mCharset = aCharset;
00560   mCharsetSource = aCharsetSource; 
00561   if(mParserContext && mParserContext->mScanner)
00562      mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
00563 }
00564 
00565 void nsParser::SetSinkCharset(nsACString& aCharset)
00566 {
00567   if (mSink) {
00568     mSink->SetDocumentCharset(aCharset);
00569   }
00570 }
00571 
00580 NS_IMETHODIMP_(void) nsParser::SetContentSink(nsIContentSink* aSink)
00581 {
00582   NS_PRECONDITION(aSink,"sink cannot be null!");
00583   mSink = aSink;
00584   
00585   if (mSink) {
00586     mSink->SetParser(this);
00587   }
00588 }
00589 
00595 NS_IMETHODIMP_(nsIContentSink*) nsParser::GetContentSink(void)
00596 {
00597   return mSink;
00598 }
00599 
00608 NS_IMETHODIMP
00609 nsParser::RegisterDTD(nsIDTD* aDTD)
00610 {
00611   CSharedParserObjects* sharedObjects;
00612   nsresult rv = GetSharedObjects(&sharedObjects);
00613   NS_ENSURE_SUCCESS(rv, rv);
00614   return sharedObjects->RegisterDTD(aDTD);
00615 }
00616 
00623 NS_IMETHODIMP_(nsDTDMode) nsParser::GetParseMode(void)
00624 {
00625   if(mParserContext)
00626     return mParserContext->mDTDMode;
00627   NS_NOTREACHED("no parser context");
00628   return eDTDMode_unknown;
00629 }
00630 
00631 
// Walks a character buffer one "word" at a time (see GetNextWord()).
// The tokenizer only keeps pointers into the caller's buffer; it never
// copies or modifies the characters.
00632 template <class CharT>
00633 class CWordTokenizer {
00634 public:
  // aBuffer:      first character of the buffer to scan.
  // aStartOffset: initial value of mOffset, i.e. where the first scan begins.
  // aMaxOffset:   added to aBuffer to form mEndBuffer, the scan limit.
00635   CWordTokenizer(const CharT* aBuffer,PRInt32 aStartOffset,PRInt32 aMaxOffset) {
00636     mLength=0;
00637     mOffset=aStartOffset;
00638     mMaxOffset=aMaxOffset;
00639     mBuffer=aBuffer;
00640     mEndBuffer=mBuffer+mMaxOffset;
00641   }
00642 
00643   //********************************************************************************
00644   // Get offset of nth word in string.
00645   // We define words as: 
00646   //    1) sequence of alphanum; 
00647   //    2) quoted substring
00648   //    3) SGML comment -- ... -- 
00649   // Returns offset of nth word, or -1 (if out of words).
00650   //********************************************************************************
  //
  // Each call resumes right after the previously returned word
  // (mBuffer+mOffset+mLength) and updates mOffset/mLength; GetLength()
  // reports the length of the word just found.
  //
  // aSkipQuotes: when PR_TRUE and the next word starts with kQuote, the scan
  //              pointer is advanced past that quote before the word start is
  //              recorded.
  //
  // NOTE(review): both loops below test |cp++<mEndBuffer| and dereference cp
  // only after the increment, so the character at the resume position itself
  // is never examined and *mEndBuffer can be read. This looks like it relies
  // on the buffer having a readable terminator at mEndBuffer -- confirm
  // before reusing.
00651   
00652   PRInt32 GetNextWord(PRBool aSkipQuotes=PR_FALSE) {
00653     
00654     if(mOffset == kNotFound) {
00655       return kNotFound; // Ref. bug 89732
00656     }
00657 
00658     if (mOffset >= 0) {
00659       const CharT *cp=mBuffer+mOffset+mLength;  //skip last word
00660 
00661       mLength=0;  //reset this
00662       mOffset=-1; //reset this        
00663 
00664       //now skip whitespace...
00665 
      // |target| remembers which delimiter opened the word: kQuote, kMinus,
      // or 0 for a plain word.
00666       CharT target=0;
00667       PRBool    done=PR_FALSE;
00668 
00669       while((!done) && (cp++<mEndBuffer)) {
00670         switch(*cp) {
00671           case kSpace:  case kNewLine:
00672           case kCR:     case kTab:
00673           case kEqual:
            // separators: keep scanning (continues the enclosing while loop)
00674             continue;
00675 
00676           case kQuote:
00677             target=*cp;
00678             if (aSkipQuotes) {
00679               ++cp;
00680             }
00681             done=PR_TRUE;
00682             break;
00683 
00684           case kMinus:
00685             target=*cp;
00686             done=PR_TRUE;
00687             break;
00688 
00689           default:
00690             done=PR_TRUE;
00691             break;
00692         }
00693       }
00694 
      // Only record a word when the scan stopped inside the buffer.
00695       if(cp<mEndBuffer) {  
00696 
00697         const CharT *firstcp=cp; //hang onto this...      
        // Starts at 2 and ends the word when it reaches 4; presumably the two
        // opening dashes of an SGML comment are counted up front -- TODO
        // confirm.
00698         PRInt32 theDashCount=2;
00699 
00700         ++cp; //just skip first letter to simplify processing...
00701 
00702         //ok, now find end of this word
00703         while(cp++<mEndBuffer) {
00704           if(kQuote==target) {
00705             if(kQuote==*cp) {
00706               ++cp;
00707               break; //we found our end...
00708             }
00709           }
00710           else if(kMinus==target) {
00711             //then let's look for SGML comments
00712             if(kMinus==*cp) {
00713               if(4==++theDashCount) {
00714                 ++cp;
00715                 break;
00716               }
00717             }
00718           }
00719           else {
            // plain word: ends at whitespace, '>', a quote or '='
00720             if((kSpace==*cp) ||
00721                (kNewLine==*cp) ||
00722                (kGreaterThan==*cp) ||
00723                (kQuote==*cp) ||
00724                (kCR==*cp) ||
00725                (kTab==*cp) || 
00726                (kEqual == *cp)) {
00727               break;
00728             }
00729           }
00730         }
00731 
        // Publish the result: length of the word and its offset from mBuffer
        // (or -1 when the length is not positive).
00732         mLength=cp-firstcp;
00733         mOffset = (0<mLength) ? firstcp-mBuffer : -1;
00734 
00735       }
00736     }
00737 
00738     return mOffset;
00739   }
00740 
  // Length of the word found by the most recent GetNextWord() call.
00741   PRInt32 GetLength() const {
00742     return mLength;
00743   }
00744 
00745   PRInt32     mOffset;      // offset of the current word from mBuffer, or -1
00746   PRInt32     mMaxOffset;   // as passed to the constructor
00747   PRInt32     mLength;      // length of the current word
00748   const CharT*  mBuffer;    // start of the scanned buffer (not owned)
00749   const CharT*  mEndBuffer; // mBuffer + mMaxOffset
00750 };
00751 
00762 // Parse the PS production in the SGML spec (excluding the part dealing
00763 // with entity references) starting at theIndex into theBuffer, and
00764 // return the first index after the end of the production.
00765 static PRInt32 ParsePS(const nsString& aBuffer, PRInt32 aIndex)
00766 {
00767   for(;;) {
00768     PRUnichar ch = aBuffer.CharAt(aIndex);
00769     if ((ch == PRUnichar(' ')) || (ch == PRUnichar('\t')) ||
00770         (ch == PRUnichar('\n')) || (ch == PRUnichar('\r'))) {
00771       ++aIndex;
00772     } else if (ch == PRUnichar('-')) {
00773       PRInt32 tmpIndex;
00774       if (aBuffer.CharAt(aIndex+1) == PRUnichar('-') &&
00775           kNotFound != (tmpIndex=aBuffer.Find("--",PR_FALSE,aIndex+2,-1))) {
00776         aIndex = tmpIndex + 2;
00777       } else {
00778         return aIndex;
00779       }
00780     } else {
00781       return aIndex;
00782     }
00783   }
00784 }
00785 
// Bits that ParseDocTypeDecl() sets in *aResultFlags.
#define PARSE_DTD_HAVE_DOCTYPE          0x01
#define PARSE_DTD_HAVE_PUBLIC_ID        0x02
#define PARSE_DTD_HAVE_SYSTEM_ID        0x04
#define PARSE_DTD_HAVE_INTERNAL_SUBSET  0x08
00790 
00791 // return PR_TRUE on success (includes not present), PR_FALSE on failure
00792 static PRBool ParseDocTypeDecl(const nsString &aBuffer,
00793                                PRInt32 *aResultFlags,
00794                                nsString &aPublicID,
00795                                nsString &aSystemID)
00796 {
00797   PRBool haveDoctype = PR_FALSE;
00798   *aResultFlags = 0;
00799 
00800   // Skip through any comments and processing instructions
00801   // The PI-skipping is a bit of a hack.
00802   PRInt32 theIndex = 0;
00803   do {
00804     theIndex = aBuffer.FindChar('<', theIndex);
00805     if (theIndex == kNotFound) break;
00806     PRUnichar nextChar = aBuffer.CharAt(theIndex+1);
00807     if (nextChar == PRUnichar('!')) {
00808       PRInt32 tmpIndex = theIndex + 2;
00809       if (kNotFound !=
00810           (theIndex=aBuffer.Find("DOCTYPE", PR_TRUE, tmpIndex, 1))) {
00811         haveDoctype = PR_TRUE;
00812         theIndex += 7; // skip "DOCTYPE"
00813         break;
00814       }
00815       theIndex = ParsePS(aBuffer, tmpIndex);
00816       theIndex = aBuffer.FindChar('>', theIndex);
00817     } else if (nextChar == PRUnichar('?')) {
00818       theIndex = aBuffer.FindChar('>', theIndex);
00819     } else {
00820       break;
00821     }
00822   } while (theIndex != kNotFound);
00823 
00824   if (!haveDoctype)
00825     return PR_TRUE;
00826   *aResultFlags |= PARSE_DTD_HAVE_DOCTYPE;
00827 
00828   theIndex = ParsePS(aBuffer, theIndex);
00829   theIndex = aBuffer.Find("HTML", PR_TRUE, theIndex, 1);
00830   if(kNotFound == theIndex)
00831     return PR_FALSE;
00832   theIndex = ParsePS(aBuffer, theIndex+4);
00833   PRInt32 tmpIndex = aBuffer.Find("PUBLIC", PR_TRUE, theIndex, 1);
00834 
00835   if (kNotFound != tmpIndex) {
00836     theIndex = ParsePS(aBuffer, tmpIndex+6);
00837 
00838     // We get here only if we've read <!DOCTYPE HTML PUBLIC
00839     // (not case sensitive) possibly with comments within.
00840 
00841     // Now find the beginning and end of the public identifier
00842     // and the system identifier (if present).
00843 
00844     PRUnichar lit = aBuffer.CharAt(theIndex);
00845     if ((lit != PRUnichar('\"')) && (lit != PRUnichar('\'')))
00846       return PR_FALSE;
00847 
00848     // Start is the first character, excluding the quote, and End is
00849     // the final quote, so there are (end-start) characters.
00850 
00851     PRInt32 PublicIDStart = theIndex + 1;
00852     PRInt32 PublicIDEnd = aBuffer.FindChar(lit, PublicIDStart);
00853     if (kNotFound == PublicIDEnd)
00854       return PR_FALSE;
00855     theIndex = ParsePS(aBuffer, PublicIDEnd + 1);
00856     PRUnichar next = aBuffer.CharAt(theIndex);
00857     if (next == PRUnichar('>')) {
00858       // There was a public identifier, but no system
00859       // identifier,
00860       // so do nothing.
00861       // This is needed to avoid the else at the end, and it's
00862       // also the most common case.
00863     } else if ((next == PRUnichar('\"')) ||
00864                (next == PRUnichar('\''))) {
00865       // We found a system identifier.
00866       *aResultFlags |= PARSE_DTD_HAVE_SYSTEM_ID;
00867       PRInt32 SystemIDStart = theIndex + 1;
00868       PRInt32 SystemIDEnd = aBuffer.FindChar(next, SystemIDStart);
00869       if (kNotFound == SystemIDEnd)
00870         return PR_FALSE;
00871       aSystemID =
00872         Substring(aBuffer, SystemIDStart, SystemIDEnd - SystemIDStart);
00873     } else if (next == PRUnichar('[')) {
00874       // We found an internal subset.
00875       *aResultFlags |= PARSE_DTD_HAVE_INTERNAL_SUBSET;
00876     } else {
00877       // Something's wrong.
00878       return PR_FALSE;
00879     }
00880 
00881     // Since a public ID is a minimum literal, we must trim
00882     // and collapse whitespace
00883     aPublicID = Substring(aBuffer, PublicIDStart, PublicIDEnd - PublicIDStart);
00884     aPublicID.CompressWhitespace(PR_TRUE, PR_TRUE);
00885     *aResultFlags |= PARSE_DTD_HAVE_PUBLIC_ID;
00886   } else {
00887     tmpIndex=aBuffer.Find("SYSTEM", PR_TRUE, theIndex, 1);
00888     if (kNotFound != tmpIndex) {
00889       // DOCTYPES with system ID but no Public ID
00890       *aResultFlags |= PARSE_DTD_HAVE_SYSTEM_ID;
00891       
00892       theIndex = ParsePS(aBuffer, tmpIndex+6);
00893       PRUnichar next = aBuffer.CharAt(theIndex);
00894       if (next != PRUnichar('\"') && next != PRUnichar('\''))
00895         return PR_FALSE;
00896 
00897       PRInt32 SystemIDStart = theIndex + 1;
00898       PRInt32 SystemIDEnd = aBuffer.FindChar(next, SystemIDStart);
00899 
00900       if (kNotFound == SystemIDEnd)
00901         return PR_FALSE;
00902       aSystemID =
00903         Substring(aBuffer, SystemIDStart, SystemIDEnd - SystemIDStart);
00904       theIndex = ParsePS(aBuffer, SystemIDEnd + 1);
00905     }
00906 
00907     PRUnichar nextChar = aBuffer.CharAt(theIndex);
00908     if (nextChar == PRUnichar('['))
00909       *aResultFlags |= PARSE_DTD_HAVE_INTERNAL_SUBSET;
00910     else if (nextChar != PRUnichar('>'))
00911       return PR_FALSE;
00912   }
00913   return PR_TRUE;
00914 }
00915 
// One row of the public-identifier table: a public ID together with the
// mode to use for it.
struct PubIDInfo {
  enum eMode {
    // always quirks mode, unless there's an internal subset
    eQuirks,
    // ditto, but pre-HTML4 (no tbody)
    eQuirks3,
    // eCompatibility_AlmostStandards
    eAlmostStandards,
    // eCompatibility_FullStandards
    eFullStandards
    // Public IDs that should trigger strict mode are not listed
    // since we want all future public IDs to trigger strict mode as
    // well.
  };

  // Public identifier text.
  const char* name;
  // Going by the field names: the mode used when the DOCTYPE has no system
  // identifier, and the mode used when it has one -- confirm against the
  // code that consumes the table.
  eMode mode_if_no_sysid;
  eMode mode_if_sysid;
};
00933 
// Number of elements in a true array (not a pointer). The argument is
// parenthesized before indexing so that an expression argument is indexed
// as a whole.
#define ELEMENTS_OF(array_) (sizeof(array_)/sizeof((array_)[0]))
00935 
00936 // Table of known public identifiers: each entry gives the lower-cased
00937 // public ID, then the mode to use when the DOCTYPE has no system ID, then
00938 // the mode to use when it has one.
00939 //
00940 // Entries must be in nsCRT::strcmp order so DetermineHTMLParseMode() can
00941 // binary-search them.  Lookups lower-case the document's public ID first,
00942 // because many existing web sites use the wrong case, so every name here
00943 // is lower case (the correctly-cased form follows in a comment).  Both
00944 // properties are verified, |#ifdef DEBUG|, by VerifyPublicIDs().
00945 static const PubIDInfo kPublicIDs[] = {
00946   {"+//silmaril//dtd html pro v0r11 19970101//en" /* "+//Silmaril//dtd html Pro v0r11 19970101//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00947   {"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en" /* "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00948   {"-//as//dtd html 3.0 aswedit + extensions//en" /* "-//AS//DTD HTML 3.0 asWedit + extensions//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00949   {"-//ietf//dtd html 2.0 level 1//en" /* "-//IETF//DTD HTML 2.0 Level 1//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00950   {"-//ietf//dtd html 2.0 level 2//en" /* "-//IETF//DTD HTML 2.0 Level 2//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00951   {"-//ietf//dtd html 2.0 strict level 1//en" /* "-//IETF//DTD HTML 2.0 Strict Level 1//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00952   {"-//ietf//dtd html 2.0 strict level 2//en" /* "-//IETF//DTD HTML 2.0 Strict Level 2//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00953   {"-//ietf//dtd html 2.0 strict//en" /* "-//IETF//DTD HTML 2.0 Strict//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00954   {"-//ietf//dtd html 2.0//en" /* "-//IETF//DTD HTML 2.0//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00955   {"-//ietf//dtd html 2.1e//en" /* "-//IETF//DTD HTML 2.1E//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00956   {"-//ietf//dtd html 3.0//en" /* "-//IETF//DTD HTML 3.0//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00957   {"-//ietf//dtd html 3.0//en//" /* "-//IETF//DTD HTML 3.0//EN//" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00958   {"-//ietf//dtd html 3.2 final//en" /* "-//IETF//DTD HTML 3.2 Final//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00959   {"-//ietf//dtd html 3.2//en" /* "-//IETF//DTD HTML 3.2//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00960   {"-//ietf//dtd html 3//en" /* "-//IETF//DTD HTML 3//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00961   {"-//ietf//dtd html level 0//en" /* "-//IETF//DTD HTML Level 0//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00962   {"-//ietf//dtd html level 0//en//2.0" /* "-//IETF//DTD HTML Level 0//EN//2.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00963   {"-//ietf//dtd html level 1//en" /* "-//IETF//DTD HTML Level 1//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00964   {"-//ietf//dtd html level 1//en//2.0" /* "-//IETF//DTD HTML Level 1//EN//2.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00965   {"-//ietf//dtd html level 2//en" /* "-//IETF//DTD HTML Level 2//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00966   {"-//ietf//dtd html level 2//en//2.0" /* "-//IETF//DTD HTML Level 2//EN//2.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00967   {"-//ietf//dtd html level 3//en" /* "-//IETF//DTD HTML Level 3//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00968   {"-//ietf//dtd html level 3//en//3.0" /* "-//IETF//DTD HTML Level 3//EN//3.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00969   {"-//ietf//dtd html strict level 0//en" /* "-//IETF//DTD HTML Strict Level 0//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00970   {"-//ietf//dtd html strict level 0//en//2.0" /* "-//IETF//DTD HTML Strict Level 0//EN//2.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00971   {"-//ietf//dtd html strict level 1//en" /* "-//IETF//DTD HTML Strict Level 1//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00972   {"-//ietf//dtd html strict level 1//en//2.0" /* "-//IETF//DTD HTML Strict Level 1//EN//2.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00973   {"-//ietf//dtd html strict level 2//en" /* "-//IETF//DTD HTML Strict Level 2//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00974   {"-//ietf//dtd html strict level 2//en//2.0" /* "-//IETF//DTD HTML Strict Level 2//EN//2.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00975   {"-//ietf//dtd html strict level 3//en" /* "-//IETF//DTD HTML Strict Level 3//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00976   {"-//ietf//dtd html strict level 3//en//3.0" /* "-//IETF//DTD HTML Strict Level 3//EN//3.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00977   {"-//ietf//dtd html strict//en" /* "-//IETF//DTD HTML Strict//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00978   {"-//ietf//dtd html strict//en//2.0" /* "-//IETF//DTD HTML Strict//EN//2.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00979   {"-//ietf//dtd html strict//en//3.0" /* "-//IETF//DTD HTML Strict//EN//3.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00980   {"-//ietf//dtd html//en" /* "-//IETF//DTD HTML//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00981   {"-//ietf//dtd html//en//2.0" /* "-//IETF//DTD HTML//EN//2.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00982   {"-//ietf//dtd html//en//3.0" /* "-//IETF//DTD HTML//EN//3.0" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00983   {"-//metrius//dtd metrius presentational//en" /* "-//Metrius//DTD Metrius Presentational//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
00984   {"-//microsoft//dtd internet explorer 2.0 html strict//en" /* "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00985   {"-//microsoft//dtd internet explorer 2.0 html//en" /* "-//Microsoft//DTD Internet Explorer 2.0 HTML//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00986   {"-//microsoft//dtd internet explorer 2.0 tables//en" /* "-//Microsoft//DTD Internet Explorer 2.0 Tables//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00987   {"-//microsoft//dtd internet explorer 3.0 html strict//en" /* "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00988   {"-//microsoft//dtd internet explorer 3.0 html//en" /* "-//Microsoft//DTD Internet Explorer 3.0 HTML//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00989   {"-//microsoft//dtd internet explorer 3.0 tables//en" /* "-//Microsoft//DTD Internet Explorer 3.0 Tables//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00990   {"-//netscape comm. corp.//dtd html//en" /* "-//Netscape Comm. Corp.//DTD HTML//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00991   {"-//netscape comm. corp.//dtd strict html//en" /* "-//Netscape Comm. Corp.//DTD Strict HTML//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00992   {"-//o'reilly and associates//dtd html 2.0//en" /* "-//O'Reilly and Associates//DTD HTML 2.0//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00993   {"-//o'reilly and associates//dtd html extended 1.0//en" /* "-//O'Reilly and Associates//DTD HTML Extended 1.0//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00994   {"-//o'reilly and associates//dtd html extended relaxed 1.0//en" /* "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00995   {"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//en" /* "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
00996   {"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//en" /* "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
00997   {"-//spyglass//dtd html 2.0 extended//en" /* "-//Spyglass//DTD HTML 2.0 Extended//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00998   {"-//sq//dtd html 2.0 hotmetal + extensions//en" /* "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
00999   {"-//sun microsystems corp.//dtd hotjava html//en" /* "-//Sun Microsystems Corp.//DTD HotJava HTML//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01000   {"-//sun microsystems corp.//dtd hotjava strict html//en" /* "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01001   {"-//w3c//dtd html 3 1995-03-24//en" /* "-//W3C//DTD HTML 3 1995-03-24//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01002   {"-//w3c//dtd html 3.2 draft//en" /* "-//W3C//DTD HTML 3.2 Draft//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01003   {"-//w3c//dtd html 3.2 final//en" /* "-//W3C//DTD HTML 3.2 Final//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01004   {"-//w3c//dtd html 3.2//en" /* "-//W3C//DTD HTML 3.2//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01005   {"-//w3c//dtd html 3.2s draft//en" /* "-//W3C//DTD HTML 3.2S Draft//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01006   {"-//w3c//dtd html 4.0 frameset//en" /* "-//W3C//DTD HTML 4.0 Frameset//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
01007   {"-//w3c//dtd html 4.0 transitional//en" /* "-//W3C//DTD HTML 4.0 Transitional//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
01008   {"-//w3c//dtd html 4.01 frameset//en" /* "-//W3C//DTD HTML 4.01 Frameset//EN" */, PubIDInfo::eQuirks, PubIDInfo::eAlmostStandards},
01009   {"-//w3c//dtd html 4.01 transitional//en" /* "-//W3C//DTD HTML 4.01 Transitional//EN" */, PubIDInfo::eQuirks, PubIDInfo::eAlmostStandards},
01010   {"-//w3c//dtd html experimental 19960712//en" /* "-//W3C//DTD HTML Experimental 19960712//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01011   {"-//w3c//dtd html experimental 970421//en" /* "-//W3C//DTD HTML Experimental 970421//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01012   {"-//w3c//dtd w3 html//en" /* "-//W3C//DTD W3 HTML//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01013   {"-//w3c//dtd xhtml 1.0 frameset//en" /* "-//W3C//DTD XHTML 1.0 Frameset//EN" */, PubIDInfo::eAlmostStandards, PubIDInfo::eAlmostStandards},
01014   {"-//w3c//dtd xhtml 1.0 transitional//en" /* "-//W3C//DTD XHTML 1.0 Transitional//EN" */, PubIDInfo::eAlmostStandards, PubIDInfo::eAlmostStandards},
01015   {"-//w3o//dtd w3 html 3.0//en" /* "-//W3O//DTD W3 HTML 3.0//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01016   {"-//w3o//dtd w3 html 3.0//en//" /* "-//W3O//DTD W3 HTML 3.0//EN//" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01017   {"-//w3o//dtd w3 html strict 3.0//en//" /* "-//W3O//DTD W3 HTML Strict 3.0//EN//" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01018   {"-//webtechs//dtd mozilla html 2.0//en" /* "-//WebTechs//DTD Mozilla HTML 2.0//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01019   {"-//webtechs//dtd mozilla html//en" /* "-//WebTechs//DTD Mozilla HTML//EN" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01020   {"-/w3c/dtd html 4.0 transitional/en" /* "-/W3C/DTD HTML 4.0 Transitional/EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
01021   {"html" /* "HTML" */, PubIDInfo::eQuirks3, PubIDInfo::eQuirks3},
01022 };
01023 
#ifdef DEBUG
/**
 * Debug-only consistency check of kPublicIDs.  The work is done on the
 * first call only; later calls return immediately.
 *
 * Two properties are checked, and each violation is reported through
 * NS_NOTREACHED followed by a printf naming the offending entries:
 *   - every entry sorts strictly before its successor (nsCRT::strcmp)
 *   - every entry is identical to its own lower-cased form
 */
static void VerifyPublicIDs()
{
  static PRBool gVerified = PR_FALSE;
  if (gVerified) {
    return;
  }
  gVerified = PR_TRUE;

  PRUint32 pos;

  // Ordering: compare each name against the one that follows it.
  for (pos = 0; pos < ELEMENTS_OF(kPublicIDs) - 1; ++pos) {
    const char* thisName = kPublicIDs[pos].name;
    const char* nextName = kPublicIDs[pos + 1].name;
    if (nsCRT::strcmp(thisName, nextName) >= 0) {
      NS_NOTREACHED("doctypes out of order");
      printf("Doctypes %s and %s out of order.\n",
             thisName, nextName);
    }
  }

  // Case: lower-casing a name must not change it.
  for (pos = 0; pos < ELEMENTS_OF(kPublicIDs); ++pos) {
    const char* entryName = kPublicIDs[pos].name;
    nsCAutoString lowered(entryName);
    ToLowerCase(lowered);
    if (nsCRT::strcmp(entryName, lowered.get()) != 0) {
      NS_NOTREACHED("doctype not lower case");
      printf("Doctype %s not lower case.\n", entryName);
    }
  }
}
#endif
01049 
01050 static void DetermineHTMLParseMode(const nsString& aBuffer,
01051                                    nsDTDMode& aParseMode,
01052                                    eParserDocType& aDocType)
01053 {
01054 #ifdef DEBUG
01055   VerifyPublicIDs();
01056 #endif
01057   PRInt32 resultFlags;
01058   nsAutoString publicIDUCS2, sysIDUCS2;
01059   if (ParseDocTypeDecl(aBuffer, &resultFlags, publicIDUCS2, sysIDUCS2)) {
01060     if (!(resultFlags & PARSE_DTD_HAVE_DOCTYPE)) {
01061 
01062       // no DOCTYPE
01063       aParseMode = eDTDMode_quirks;
01064       aDocType = eHTML_Quirks;
01065 
01066     } else if ((resultFlags & PARSE_DTD_HAVE_INTERNAL_SUBSET) ||
01067                !(resultFlags & PARSE_DTD_HAVE_PUBLIC_ID)) {
01068 
01069       // A doctype with an internal subset is always full_standards.
01070       // A doctype without a public ID is always full_standards.
01071       aDocType = eHTML_Strict;
01072       aParseMode = eDTDMode_full_standards;
01073 
01074       // Special hack for IBM's custom DOCTYPE.
01075       if (!(resultFlags & PARSE_DTD_HAVE_INTERNAL_SUBSET) &&
01076           sysIDUCS2 == NS_LITERAL_STRING(
01077                "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")) {
01078         aParseMode = eDTDMode_quirks;
01079         aDocType = eHTML_Quirks;
01080       }
01081 
01082     } else {
01083 
01084       // We have to check our list of public IDs to see what to do.
01085 
01086       // Yes, we want UCS2 to ASCII lossy conversion.
01087       nsCAutoString publicID;
01088       publicID.AssignWithConversion(publicIDUCS2);
01089 
01090       // See comment above definition of kPublicIDs about case
01091       // sensitivity.
01092       ToLowerCase(publicID);
01093 
01094       // binary search to see if we can find the correct public ID
01095         // These must be signed since maximum can go below zero and we'll
01096         // crash if it's unsigned.
01097       PRInt32 minimum = 0;
01098       PRInt32 maximum = ELEMENTS_OF(kPublicIDs) - 1;
01099       PRInt32 index;
01100       for (;;) {
01101         index = (minimum + maximum) / 2;
01102         PRInt32 comparison =
01103             nsCRT::strcmp(publicID.get(), kPublicIDs[index].name);
01104         if (comparison == 0)
01105           break;
01106         if (comparison < 0)
01107           maximum = index - 1;
01108         else
01109           minimum = index + 1;
01110 
01111         if (maximum < minimum) {
01112           // The DOCTYPE is not in our list, so it must be full_standards.
01113           aParseMode = eDTDMode_full_standards;
01114           aDocType = eHTML_Strict;
01115           return;
01116         }
01117       }
01118 
01119       switch ((resultFlags & PARSE_DTD_HAVE_SYSTEM_ID)
01120                 ? kPublicIDs[index].mode_if_sysid
01121                 : kPublicIDs[index].mode_if_no_sysid)
01122       {
01123         case PubIDInfo::eQuirks3:
01124           aParseMode = eDTDMode_quirks;
01125           aDocType = eHTML3_Quirks;
01126           break;
01127         case PubIDInfo::eQuirks:
01128           aParseMode = eDTDMode_quirks;
01129           aDocType = eHTML_Quirks;
01130           break;
01131         case PubIDInfo::eAlmostStandards:
01132           aParseMode = eDTDMode_almost_standards;
01133           aDocType = eHTML_Strict;
01134           break;
01135         case PubIDInfo::eFullStandards:
01136           aParseMode = eDTDMode_full_standards;
01137           aDocType = eHTML_Strict;
01138           break;
01139         default:
01140           NS_NOTREACHED("no other cases!");
01141       }
01142 
01143     }
01144   } else {
01145     // badly formed DOCTYPE -> quirks
01146     aParseMode = eDTDMode_quirks;
01147     aDocType = eHTML3_Quirks;
01148   }
01149 }
01150 
01151 static 
01152 void DetermineParseMode(const nsString& aBuffer,
01153                         nsDTDMode& aParseMode,
01154                         eParserDocType& aDocType,
01155                         const nsACString& aMimeType)
01156 {
01157   if (aMimeType.EqualsLiteral(kHTMLTextContentType)) {
01158     DetermineHTMLParseMode(aBuffer, aParseMode, aDocType);
01159   } else if (aMimeType.EqualsLiteral(kPlainTextContentType) ||
01160              aMimeType.EqualsLiteral(kTextCSSContentType) ||
01161              aMimeType.EqualsLiteral(kApplicationJSContentType) ||
01162              aMimeType.EqualsLiteral(kApplicationXJSContentType) ||
01163              aMimeType.EqualsLiteral(kTextECMAScriptContentType) ||
01164              aMimeType.EqualsLiteral(kApplicationECMAScriptContentType) ||
01165              aMimeType.EqualsLiteral(kTextJSContentType)) {
01166     aDocType = ePlainText;
01167     aParseMode = eDTDMode_quirks;
01168   } else { // Some form of XML
01169     aDocType = eXML;
01170     aParseMode = eDTDMode_full_standards;
01171   }
01172 }
01173 
01181 static
01182 nsresult
01183 FindSuitableDTD(CParserContext& aParserContext,
01184                 PRBool* aReturn)
01185 {
01186   *aReturn = PR_FALSE;
01187   //Let's start by trying the defaultDTD, if one exists...
01188   if(aParserContext.mDTD) {
01189     eAutoDetectResult canParse = aParserContext.mDTD->CanParse(aParserContext);
01190     if(canParse != eUnknownDetect && canParse != eInvalidDetect)
01191       return PR_TRUE;
01192   }
01193 
01194   CSharedParserObjects* sharedObjects;
01195   nsresult rv = GetSharedObjects(&sharedObjects);
01196   NS_ENSURE_SUCCESS(rv, rv);
01197 
01198   aParserContext.mAutoDetectStatus = eUnknownDetect;
01199   PRInt32 theDTDIndex = 0;
01200   nsIDTD* theBestDTD  = 0;
01201   nsIDTD* theDTD      = 0;
01202   PRBool  thePrimaryFound = PR_FALSE;
01203 
01204   while ((theDTDIndex <= sharedObjects->mDTDDeque.GetSize()) && 
01205          (aParserContext.mAutoDetectStatus != ePrimaryDetect)){
01206     theDTD = NS_STATIC_CAST(nsIDTD*, sharedObjects->mDTDDeque.ObjectAt(theDTDIndex++));
01207     if (theDTD) {
01208       // Store detect status in temp ( theResult ) to avoid bugs such as
01209       // 36233, 36754, 36491, 36323. Basically, we should avoid calling DTD's
01210       // WillBuildModel() multiple times, i.e., we shouldn't leave auto-detect-status
01211       // unknown.
01212       eAutoDetectResult theResult = theDTD->CanParse(aParserContext);
01213       if (eValidDetect == theResult){
01214         aParserContext.mAutoDetectStatus = eValidDetect;
01215         theBestDTD = theDTD;
01216       }
01217       else if (ePrimaryDetect == theResult) {  
01218         theBestDTD = theDTD;
01219         thePrimaryFound = PR_TRUE;
01220         aParserContext.mAutoDetectStatus = ePrimaryDetect;
01221       }
01222     }
01223     if (theDTDIndex == sharedObjects->mDTDDeque.GetSize() && !thePrimaryFound) {
01224       if (!sharedObjects->mHasXMLDTD) {
01225         rv = NS_NewExpatDriver(&theDTD); //do this to view XML files...
01226         NS_ENSURE_SUCCESS(rv, rv);
01227 
01228         sharedObjects->mDTDDeque.Push(theDTD);
01229         sharedObjects->mHasXMLDTD = PR_TRUE;
01230       }
01231 #ifdef MOZ_VIEW_SOURCE
01232       else if (!sharedObjects->mHasViewSourceDTD) {
01233         rv = NS_NewViewSourceHTML(&theDTD);  //do this so all non-html files can be viewed...
01234         NS_ENSURE_SUCCESS(rv, rv);
01235         
01236         sharedObjects->mDTDDeque.Push(theDTD);
01237         sharedObjects->mHasViewSourceDTD = PR_TRUE;
01238       }
01239 #endif
01240     }
01241   }
01242 
01243   if(theBestDTD) {
01244     rv = theBestDTD->CreateNewInstance(&aParserContext.mDTD);
01245     NS_ENSURE_SUCCESS(rv, rv);
01246 
01247     *aReturn = PR_TRUE;
01248   }
01249 
01250   return rv;
01251 }
01252 
01253 NS_IMETHODIMP 
01254 nsParser::CancelParsingEvents()
01255 {
01256   if (mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT) {
01257     NS_ASSERTION(mEventQueue,"Event queue is null");
01258     // Revoke all pending continue parsing events 
01259     if (mEventQueue != nsnull) {
01260       mEventQueue->RevokeEvents(this);
01261     } 
01262 
01263     mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
01264   }
01265   return NS_OK;
01266 }
01267 
01269 
01270 
01278 nsresult 
01279 nsParser::WillBuildModel(nsString& aFilename)
01280 {
01281   if (!mParserContext)
01282     return kInvalidParserContext;
01283 
01284   if (eUnknownDetect != mParserContext->mAutoDetectStatus)
01285     return NS_OK;
01286 
01287   if (eDTDMode_unknown == mParserContext->mDTDMode ||
01288       eDTDMode_autodetect == mParserContext->mDTDMode) {
01289     PRUnichar buf[1025];
01290     nsFixedString theBuffer(buf, 1024, 0);
01291 
01292     // Grab 1024 characters, starting at the first non-whitespace
01293     // character, to look for the doctype in.
01294     mParserContext->mScanner->Peek(theBuffer, 1024, mParserContext->mScanner->FirstNonWhitespacePosition());    
01295     DetermineParseMode(theBuffer, mParserContext->mDTDMode,
01296                        mParserContext->mDocType, mParserContext->mMimeType);
01297   }
01298   
01299   PRBool found;
01300   nsresult rv = FindSuitableDTD(*mParserContext, &found);
01301   NS_ENSURE_SUCCESS(rv, rv);
01302 
01303   if (!found)
01304     return rv;
01305 
01306   nsITokenizer* tokenizer;
01307   mParserContext->GetTokenizer(mParserContext->mDTD->GetType(), mSink, tokenizer);
01308   return mParserContext->mDTD->WillBuildModel(*mParserContext, tokenizer, mSink);
01309 }
01310 
01319 nsresult nsParser::DidBuildModel(nsresult anErrorCode) {
01320   //One last thing...close any open containers.
01321   nsresult result=anErrorCode;
01322 
01323   if (IsComplete()) {
01324     if (mParserContext && !mParserContext->mPrevContext) {
01325       if (mParserContext->mDTD) {
01326         result = mParserContext->mDTD->DidBuildModel(anErrorCode,PR_TRUE,this,mSink);
01327       }
01328       //Ref. to bug 61462.
01329       mParserContext->mRequest = 0;
01330     }//if
01331   }
01332 
01333   return result;
01334 }
01335 
01336 
01344 void nsParser::PushContext(CParserContext& aContext) {
01345   aContext.mPrevContext=mParserContext;  
01346   mParserContext=&aContext;
01347 }
01348 
01356 CParserContext* nsParser::PopContext() 
01357 {
01358   CParserContext* oldContext = mParserContext;
01359   if (oldContext) {
01360     mParserContext = oldContext->mPrevContext;
01361     if (mParserContext) {
01362       // If the old context was blocked, propagate the blocked state
01363       // back to the new one. Also, propagate the stream listener state
01364       // but don't override onStop state to guarantee the call to DidBuildModel().
01365       if (mParserContext->mStreamListenerState != eOnStop) {
01366         mParserContext->mStreamListenerState = oldContext->mStreamListenerState;
01367       }
01368       // Update the current context's tokenizer to any information gleaned
01369       // while parsing document.write() calls (such as "a plaintext tag was
01370       // found")
01371       if (mParserContext->mTokenizer) {
01372         mParserContext->mTokenizer->CopyState(oldContext->mTokenizer);
01373       }
01374     }
01375   }
01376   return oldContext;
01377 }
01378 
01388 void nsParser::SetUnusedInput(nsString& aBuffer)
01389 {
01390   mUnusedInput=aBuffer;
01391 }
01392 
01401 NS_IMETHODIMP nsParser::Terminate(void)
01402 {
01403   nsresult result = NS_OK;
01404   // XXX - [ until we figure out a way to break parser-sink circularity ]
01405   // Hack - Hold a reference until we are completely done...
01406   nsCOMPtr<nsIParser> kungFuDeathGrip(this); 
01407   mInternalState = result = NS_ERROR_HTMLPARSER_STOPPARSING;
01408 
01409   // CancelParsingEvents must be called to avoid leaking the nsParser object
01410   // @see bug 108049
01411   // If NS_PARSER_FLAG_PENDING_CONTINUE_EVENT is set then CancelParsingEvents 
01412   // will reset it so DidBuildModel will call DidBuildModel on the DTD. Note: 
01413   // The IsComplete() call inside of DidBuildModel looks at the pendingContinueEvents flag.
01414   CancelParsingEvents();
01415 
01416   // If we got interrupted in the middle of a document.write, then we might
01417   // have more than one parser context on our parsercontext stack. This has
01418   // the effect of making DidBuildModel a no-op, meaning that we never call
01419   // our sink's DidBuildModel and break the reference cycle, causing a leak.
01420   // Since we're getting terminated, we manually clean up our context stack.
01421   while (mParserContext && mParserContext->mPrevContext) {
01422     CParserContext *prev = mParserContext->mPrevContext;
01423     NS_ASSERTION(prev->mPrevContext || prev->mDTD, "How is there no root DTD?");
01424 
01425     delete mParserContext;
01426     mParserContext = prev;
01427   }
01428 
01429   if (mParserContext && mParserContext->mDTD) {
01430     mParserContext->mDTD->Terminate();
01431     DidBuildModel(result);
01432   }
01433   else if (mSink) {
01434     // We have no parser context or no DTD yet (so we got terminated before we
01435     // got any data).  Manually break the reference cycle with the sink.
01436     result = mSink->DidBuildModel();
01437     NS_ENSURE_SUCCESS(result, result);
01438   }
01439   return NS_OK;
01440 }
01441 
01442 
01449 NS_IMETHODIMP nsParser::ContinueParsing()
01450 {    
01451   if (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) {
01452     NS_WARNING("Trying to continue parsing on a unblocked parser.");
01453     return NS_OK;
01454   }
01455 
01456   mFlags |= NS_PARSER_FLAG_PARSER_ENABLED;
01457 
01458   return ContinueInterruptedParsing();
01459 }
01460 
01461 NS_IMETHODIMP nsParser::ContinueInterruptedParsing()
01462 {
01463   // If the stream has already finished, there's a good chance
01464   // that we might start closing things down when the parser
01465   // is reenabled. To make sure that we're not deleted across
01466   // the reenabling process, hold a reference to ourselves.
01467   nsresult result=NS_OK;
01468   nsCOMPtr<nsIParser> kungFuDeathGrip(this);
01469 
01470 #ifdef DEBUG
01471   if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) {
01472     NS_WARNING("Don't call ContinueInterruptedParsing on a blocked parser.");
01473   }
01474 #endif
01475 
01476   PRBool isFinalChunk = (mParserContext &&
01477                           mParserContext->mStreamListenerState==eOnStop) ?
01478                           PR_TRUE : PR_FALSE;
01479   
01480   result=ResumeParse(PR_TRUE,isFinalChunk); // Ref. bug 57999
01481   
01482   if(result!=NS_OK) 
01483     result=mInternalState;
01484   
01485   return result;
01486 }
01487 
01495 NS_IMETHODIMP_(void) nsParser::BlockParser()
01496 {
01497   mFlags &= ~NS_PARSER_FLAG_PARSER_ENABLED;
01498   MOZ_TIMER_DEBUGLOG(("Stop: Parse Time: nsParser::BlockParser(), this=%p\n", this));
01499   MOZ_TIMER_STOP(mParseTime);
01500 }
01501 
01511 NS_IMETHODIMP_(void) nsParser::UnblockParser()
01512 {
01513   if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) {
01514     mFlags |= NS_PARSER_FLAG_PARSER_ENABLED;
01515     MOZ_TIMER_DEBUGLOG(("Start: Parse Time: nsParser::UnblockParser(), this=%p\n", this));
01516     MOZ_TIMER_START(mParseTime);
01517   }
01518   else {
01519     NS_WARNING("Trying to unblock an unblocked parser.");
01520   }
01521 }
01522 
01529 NS_IMETHODIMP_(PRBool) nsParser::IsParserEnabled()
01530 {
01531   return mFlags & NS_PARSER_FLAG_PARSER_ENABLED;
01532 }
01533 
01540 NS_IMETHODIMP_(PRBool) nsParser::IsComplete()
01541 {
01542   return !(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT);
01543 }
01544 
01545 
01546 void nsParser::HandleParserContinueEvent() {
01547   mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
01548   ContinueInterruptedParsing();
01549 }
01550 
01551 nsresult nsParser::DataAdded(const nsSubstring& aData, nsIRequest *aRequest)
01552 {
01553   NS_ASSERTION(sParserDataListeners,
01554                "Don't call this with no parser data listeners!");
01555 
01556   if (!mSink || !aRequest) {
01557     return NS_OK;
01558   }
01559 
01560   nsISupports *ctx = mSink->GetTarget();
01561   PRInt32 count = sParserDataListeners->Count();
01562   nsresult rv = NS_OK;
01563   PRBool canceled = PR_FALSE;
01564 
01565   while (count--) {
01566     rv |= sParserDataListeners->ObjectAt(count)->
01567       OnUnicharDataAvailable(aRequest, ctx, aData);
01568 
01569     if (NS_FAILED(rv) && !canceled) {
01570       aRequest->Cancel(rv);
01571 
01572       canceled = PR_TRUE;
01573     }
01574   }
01575 
01576   return rv;
01577 }
01578 
01579 PRBool nsParser::CanInterrupt(void) {
01580   return mFlags & NS_PARSER_FLAG_CAN_INTERRUPT;
01581 }
01582 
01583 void nsParser::SetCanInterrupt(PRBool aCanInterrupt) {
01584   if (aCanInterrupt) {
01585     mFlags |= NS_PARSER_FLAG_CAN_INTERRUPT;
01586   }
01587   else {
01588     mFlags &= ~NS_PARSER_FLAG_CAN_INTERRUPT;
01589   }
01590 }
01591 
01603 NS_IMETHODIMP
01604 nsParser::Parse(nsIURI* aURL,
01605                 nsIRequestObserver* aListener,
01606                 PRBool aVerifyEnabled,
01607                 void* aKey,
01608                 nsDTDMode aMode)
01609 {  
01610 
01611   NS_PRECONDITION(aURL, "Error: Null URL given");
01612 
01613   nsresult result=kBadURL;
01614   mObserver = aListener;
01615  
01616   if (aVerifyEnabled) {
01617     mFlags |= NS_PARSER_FLAG_DTD_VERIFICATION;
01618   }
01619   else {
01620     mFlags &= ~NS_PARSER_FLAG_DTD_VERIFICATION;
01621   }
01622 
01623   if(aURL) {
01624     nsCAutoString spec;
01625     nsresult rv = aURL->GetSpec(spec);
01626     if (rv != NS_OK) {      
01627       return rv;
01628     }
01629     NS_ConvertUTF8toUCS2 theName(spec);
01630 
01631     nsScanner* theScanner=new nsScanner(theName,PR_FALSE,mCharset,mCharsetSource);
01632     CParserContext* pc=new CParserContext(theScanner,aKey,mCommand,aListener);
01633     if(pc && theScanner) {
01634       pc->mMultipart=PR_TRUE;
01635       pc->mContextType=CParserContext::eCTURL;
01636       pc->mDTDMode=aMode;
01637       PushContext(*pc);
01638 
01639       // Here, and only here, hand this parser off to the scanner. We
01640       // only want to do that here since the only reason the scanner
01641       // needs the parser is to call DataAdded() on it, and that's
01642       // only ever wanted when parsing from an URI.
01643       theScanner->SetParser(this);
01644 
01645       result=NS_OK;
01646     }
01647     else{
01648       result=mInternalState=NS_ERROR_HTMLPARSER_BADCONTEXT;
01649     }
01650   }  
01651   return result;
01652 }
01653 
01654 
01661 NS_IMETHODIMP
01662 nsParser::Parse(nsIInputStream* aStream,
01663                 const nsACString& aMimeType,
01664                 PRBool aVerifyEnabled,
01665                 void* aKey,
01666                 nsDTDMode aMode)
01667 {
01668   if (aVerifyEnabled) {
01669     mFlags |= NS_PARSER_FLAG_DTD_VERIFICATION;
01670   }
01671   else {
01672     mFlags &= ~NS_PARSER_FLAG_DTD_VERIFICATION;
01673   }
01674   
01675   nsresult  result=NS_ERROR_OUT_OF_MEMORY;
01676 
01677   //ok, time to create our tokenizer and begin the process
01678   nsAutoString theUnknownFilename(NS_LITERAL_STRING("unknown"));
01679 
01680   // references 
01681   nsScanner* theScanner=new nsScanner(theUnknownFilename,aStream,mCharset,mCharsetSource);
01682 
01683   CParserContext* pc=new CParserContext(theScanner,aKey,mCommand,0);
01684   if(pc && theScanner) {
01685     PushContext(*pc);
01686     pc->SetMimeType(aMimeType);
01687     pc->mStreamListenerState=eOnStart;  
01688     pc->mMultipart=PR_FALSE;
01689     pc->mContextType=CParserContext::eCTStream;
01690     pc->mDTDMode=aMode;
01691     mParserContext->mScanner->FillBuffer();
01692     result=ResumeParse();
01693     pc=PopContext();
01694     delete pc;
01695   }
01696   else{
01697     result=mInternalState=NS_ERROR_HTMLPARSER_BADCONTEXT;
01698   }  
01699   return result;
01700 }
01701 
01712 NS_IMETHODIMP
01713 nsParser::Parse(const nsAString& aSourceBuffer,
01714                 void* aKey,
01715                 const nsACString& aMimeType,
01716                 PRBool aVerifyEnabled,
01717                 PRBool aLastCall,
01718                 nsDTDMode aMode)
01719 { 
01720 
01721   //NOTE: Make sure that updates to this method don't cause 
01722   //      bug #2361 to break again! 
01723 
01724   nsresult result=NS_OK;
01725 
01726   // Don't bother if we're never going to parse this.
01727   if (mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING) {
01728     return result;
01729   }
01730 
01731   if(!aLastCall && aSourceBuffer.IsEmpty()) {
01732     // Nothing is being passed to the parser so return
01733     // immediately. mUnusedInput will get processed when
01734     // some data is actually passed in.
01735     // But if this is the last call, make sure to finish up
01736     // stuff correctly.
01737     return result;
01738   }
01739 
01740   // hack to pass on to the dtd the caller's desire to 
01741   // parse a fragment without worrying about containment rules
01742   if (aMode == eDTDMode_fragment)
01743     mCommand = eViewFragment;
01744   
01745   // Maintain a reference to ourselves so we don't go away 
01746   // till we're completely done. 
01747   nsCOMPtr<nsIParser> kungFuDeathGrip(this);
01748 
01749   if(aLastCall || !aSourceBuffer.IsEmpty() || !mUnusedInput.IsEmpty()) {
01750     
01751     if (aVerifyEnabled) {
01752       mFlags |= NS_PARSER_FLAG_DTD_VERIFICATION;
01753     }
01754     else {
01755       mFlags &= ~NS_PARSER_FLAG_DTD_VERIFICATION;
01756     }
01757     
01758     CParserContext* pc=0;
01759 
01760     if((!mParserContext) || (mParserContext->mKey!=aKey))  { 
01761       //only make a new context if we dont have one, OR if we do, but has a different context key... 
01762   
01763       nsScanner* theScanner = new nsScanner(mUnusedInput,mCharset,mCharsetSource);
01764       NS_ENSURE_TRUE(theScanner, NS_ERROR_OUT_OF_MEMORY);
01765       
01766       nsIDTD *theDTD = 0; 
01767       eAutoDetectResult theStatus = eUnknownDetect; 
01768 
01769       if (mParserContext && mParserContext->mMimeType==aMimeType) {
01770         NS_ASSERTION(mParserContext->mDTD,"How come the DTD is null?"); // Ref. Bug 90379
01771         
01772         if (mParserContext) {
01773           // To fix bug 32263 we used create a new instance of the DTD!.
01774           // All we need is a new tokenizer which now gets created with
01775           // a parser context.
01776           theDTD = mParserContext->mDTD; 
01777           theStatus=mParserContext->mAutoDetectStatus; 
01778           //added this to fix bug 32022.
01779         }
01780       } 
01781 
01782       pc = new CParserContext(theScanner, aKey, mCommand,
01783                               0, theDTD, theStatus, aLastCall);
01784       NS_ENSURE_TRUE(pc, NS_ERROR_OUT_OF_MEMORY);
01785 
01786       PushContext(*pc); 
01787 
01788       pc->mMultipart=!aLastCall; //by default 
01789       if (pc->mPrevContext) { 
01790         pc->mMultipart |= pc->mPrevContext->mMultipart;  //if available 
01791       } 
01792 
01793       // start fix bug 40143
01794       if(pc->mMultipart) {
01795         pc->mStreamListenerState=eOnDataAvail;
01796         if(pc->mScanner) pc->mScanner->SetIncremental(PR_TRUE);
01797       }
01798       else {
01799         pc->mStreamListenerState=eOnStop;
01800         if(pc->mScanner) pc->mScanner->SetIncremental(PR_FALSE);
01801       }
01802       // end fix for 40143
01803 
01804       pc->mContextType=CParserContext::eCTString; 
01805       pc->SetMimeType(aMimeType);
01806       if (pc->mPrevContext && aMode == eDTDMode_autodetect) {
01807         // Preserve the DTD mode from the last context, bug 265814.
01808         pc->mDTDMode = pc->mPrevContext->mDTDMode;
01809       }
01810       else {
01811         pc->mDTDMode = aMode;
01812       }
01813 
01814       mUnusedInput.Truncate(); 
01815 
01816       //printf("Parse(string) iterate: %i",PR_FALSE); 
01817       pc->mScanner->Append(aSourceBuffer); 
01818       // Do not interrupt document.write() - bug 95487
01819       result = ResumeParse(PR_FALSE, PR_FALSE, PR_FALSE);
01820     } 
01821     else { 
01822       mParserContext->mScanner->Append(aSourceBuffer); 
01823       if(!mParserContext->mPrevContext) {
01824         // Set stream listener state to eOnStop, on the final context - Fix 68160,
01825         // to guarantee DidBuildModel() call - Fix 36148
01826         if(aLastCall) {
01827           mParserContext->mStreamListenerState=eOnStop;
01828           mParserContext->mScanner->SetIncremental(PR_FALSE);
01829         }
01830         ResumeParse(PR_FALSE, PR_FALSE, PR_FALSE);
01831       }
01832     } 
01833   }//if 
01834 
01835   return result; 
01836 }  
01837 
01844 NS_IMETHODIMP
01845 nsParser::ParseFragment(const nsAString& aSourceBuffer,
01846                         void* aKey,
01847                         nsVoidArray& aTagStack,
01848                         PRBool aXMLMode,
01849                         const nsACString& aMimeType,
01850                         nsDTDMode aMode)
01851 {
01852   nsresult result = NS_OK;
01853   nsAutoString  theContext;
01854   PRUint32 theCount = aTagStack.Count();
01855   PRUint32 theIndex = 0;
01856 
01857   // Disable observers for fragments
01858   mFlags &= ~NS_PARSER_FLAG_OBSERVERS_ENABLED;
01859 
01860   for (theIndex = 0; theIndex < theCount; theIndex++) {
01861     theContext.AppendLiteral("<");
01862     theContext.Append((PRUnichar*)aTagStack.ElementAt(theCount - theIndex - 1));
01863     theContext.AppendLiteral(">");
01864   }
01865 
01866   // First, parse the context to build up the DTD's tag stack. Note that we
01867   // pass PR_FALSE for the aLastCall parameter.
01868   result = Parse(theContext, (void*)&theContext, aMimeType, 
01869                  PR_FALSE, PR_FALSE, aMode);
01870   if (NS_FAILED(result)) {
01871     mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
01872     return result;
01873   }
01874 
01875   nsCOMPtr<nsIFragmentContentSink> fragSink = do_QueryInterface(mSink);
01876   if (!fragSink) {
01877     NS_ERROR("ParseFragment requires a fragment content sink");
01878     mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
01879     return NS_ERROR_HTMLPARSER_UNKNOWN;
01880   }
01881 
01882   if (!aXMLMode) {
01883     // First, we have to flush any tags that don't belong in the head if there
01884     // was no <body> in the context.
01885     // XXX This is extremely ugly. Maybe CNavDTD should have FlushMisplaced()?
01886     if (!mParserContext) {
01887       NS_ERROR("Parsing didn't create a parser context?");
01888       mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
01889       return NS_ERROR_HTMLPARSER_INVALIDPARSERCONTEXT;
01890     }
01891     nsCOMPtr<CNavDTD> dtd = do_QueryInterface(mParserContext->mDTD);
01892 
01893     if (dtd) {
01894       CStartToken bodyToken(NS_LITERAL_STRING("BODY"), eHTMLTag_body);
01895       nsCParserNode bodyNode(&bodyToken, 0);
01896 
01897       dtd->OpenBody(&bodyNode);
01898 
01899       // Now parse the flushed out tags.
01900       result = BuildModel();
01901       if (NS_FAILED(result)) {
01902         mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
01903         return result;
01904       }
01905     }
01906 
01907     // Now that we've flushed all of the tags out of the body, we have to make
01908     // sure that there aren't any context tags left in the scanner.
01909     NS_ASSERTION(mParserContext->mScanner, "Where'd the scanner go?");
01910 
01911     PRUnichar next;
01912     if (NS_SUCCEEDED(mParserContext->mScanner->Peek(next))) {
01913       // Uh, oh. This must mean that the context stack has a special tag on
01914       // it, such as <textarea> or <title> that requires its end tag before it
01915       // will be consumed. Tell the content sink that it will be coming.
01916       // Note: For now, we can assume that there is only one such tag.
01917       NS_ASSERTION(next == '<', "The tokenizer failed to consume a token");
01918       fragSink->IgnoreFirstContainer();
01919     }
01920   }
01921 
01922   fragSink->WillBuildContent();
01923   // Now, parse the actual content. Note that this is the last call
01924   // for HTML content, but for XML, we will want to build and parse
01925   // the end tags.  However, if tagStack is empty, it's the last call
01926   // for XML as well.
01927   if (!aXMLMode || (theCount == 0)) {
01928     result = Parse(aSourceBuffer, (void*)&theContext, aMimeType,
01929                    PR_FALSE, PR_TRUE, aMode);
01930     fragSink->DidBuildContent();
01931   } else {
01932     // Add an end tag chunk, so expat will read the whole source buffer,
01933     // and not worry about ']]' etc.
01934     result = Parse(aSourceBuffer + NS_LITERAL_STRING("</"),
01935                    (void*)&theContext, aMimeType, PR_FALSE, PR_FALSE, aMode);
01936     fragSink->DidBuildContent();
01937  
01938     if (NS_SUCCEEDED(result)) {
01939       nsAutoString endContext;       
01940       for (theIndex = 0; theIndex < theCount; theIndex++) {
01941          // we already added an end tag chunk above
01942         if (theIndex > 0) {
01943           endContext.AppendLiteral("</");
01944         }
01945 
01946         nsAutoString thisTag( (PRUnichar*)aTagStack.ElementAt(theIndex) );
01947         // was there an xmlns=?
01948         PRInt32 endOfTag = thisTag.FindChar(PRUnichar(' '));
01949         if (endOfTag == -1) {
01950           endContext.Append(thisTag);
01951         } else {
01952           endContext.Append(Substring(thisTag,0,endOfTag));
01953         }
01954 
01955         endContext.AppendLiteral(">");
01956       }
01957        
01958       result = Parse(endContext, (void*)&theContext, aMimeType,
01959                      PR_FALSE, PR_TRUE, aMode);
01960     }
01961   }
01962     
01963   mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED; //now reenable.
01964 
01965   return result;
01966 }
01967 
01968  
// Runs the tokenize / build-model cycle on the current parser context.
//
// Nothing is done (and NS_OK is returned) unless NS_PARSER_FLAG_PARSER_ENABLED
// is set and mInternalState is not NS_ERROR_HTMLPARSER_STOPPARSING.
//
// allowIteration - after a nested context has been popped, the loop only
//                  continues when this is set and the popped context was
//                  string based.
// aIsFinalChunk  - forwarded to Tokenize(); when BuildModel() reports an
//                  interruption and this is set, PostContinueEvent() is
//                  called. It is recomputed from the stream listener state
//                  of the new current context after a context is popped.
// aCanInterrupt  - forwarded to SetCanInterrupt() for the duration of each
//                  Tokenize()/BuildModel() pass.
//
// Returns the failure of WillBuildModel() if it fails,
// NS_ERROR_HTMLPARSER_UNRESOLVEDDTD when the context has no DTD, NS_OK when
// the parser was blocked, stopped or finished, and otherwise the last result
// with NS_ERROR_HTMLPARSER_INTERRUPTED mapped to NS_OK.
01988 nsresult nsParser::ResumeParse(PRBool allowIteration, PRBool aIsFinalChunk, PRBool aCanInterrupt) {
01989 
01990   //printf("  Resume %i, prev-context: %p\n",allowIteration,mParserContext->mPrevContext);
01991   
01992 
01993   nsresult result=NS_OK;
01994 
01995   if((mFlags & NS_PARSER_FLAG_PARSER_ENABLED) && 
01996      mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) {
01997 
01998 
01999     MOZ_TIMER_DEBUGLOG(("Start: Parse Time: nsParser::ResumeParse(), this=%p\n", this));
02000     MOZ_TIMER_START(mParseTime);
02001 
        // If WillBuildModel() fails, tokenizing is switched off for later
        // calls and the failure is handed back to the caller.
02002     result = WillBuildModel(mParserContext->mScanner->GetFilename());
02003     if (NS_FAILED(result)) {
02004       mFlags &= ~NS_PARSER_FLAG_CAN_TOKENIZE;
02005       return result;
02006     }
02007 
02008     if(mParserContext->mDTD) {
02009 
02010       mParserContext->mDTD->WillResumeParse(mSink);
          // theFirstTime is PR_TRUE here, so theIterationIsOk always starts
          // out true and the loop body runs at least once when result is NS_OK.
02011       PRBool theFirstTime=PR_TRUE;
02012       PRBool theIterationIsOk=(theFirstTime || allowIteration||(!mParserContext->mPrevContext));
02013        
02014       while((result==NS_OK) && (theIterationIsOk)) {
02015         theFirstTime=PR_FALSE;
02016         if(!mUnusedInput.IsEmpty()) {
02017           if(mParserContext->mScanner) {
02018             // -- Ref: Bug# 22485 --
02019             // Insert the unused input into the source buffer 
02020             // as if it was read from the input stream. 
02021             // Adding UngetReadable() per vidur!!
02022             mParserContext->mScanner->UngetReadable(mUnusedInput);
02023            mUnusedInput.Truncate(0);
02024           }
02025         }
02026 
02027         //Only allow parsing to be interrupted in the subsequent call
02028         //to build model.
02029         SetCanInterrupt(aCanInterrupt); 
            // Tokenizing is skipped (treated as NS_OK) when the CAN_TOKENIZE
            // flag has been cleared; BuildModel() still runs in that case.
02030         nsresult theTokenizerResult = mFlags & NS_PARSER_FLAG_CAN_TOKENIZE ? Tokenize(aIsFinalChunk) : NS_OK;   // kEOF==2152596456
02031         result=BuildModel(); 
02032 
02033         if(result==NS_ERROR_HTMLPARSER_INTERRUPTED) {
02034           if(aIsFinalChunk)
02035             PostContinueEvent();
02036         }
02037         SetCanInterrupt(PR_FALSE); 
02038 
            // Keep looping only while the tokenizer has not hit the end of the
            // buffer and the model builder was not interrupted. The branches
            // below may override this after popping a context.
02039         theIterationIsOk=PRBool((kEOF!=theTokenizerResult) && (result!=NS_ERROR_HTMLPARSER_INTERRUPTED));
02040 
02041        // Make sure not to stop parsing too early. Therefore, before shutting down the 
02042         // parser, it's important to check whether the input buffer has been scanned to 
02043         // completion ( theTokenizerResult should be kEOF ). kEOF -> End of buffer.
02044 
02045         // If we're told to block the parser, we disable all further parsing 
02046         // (and cache any data coming in) until the parser is re-enabled.
02047 
02048         if(NS_ERROR_HTMLPARSER_BLOCK==result) {
02049           //BLOCK == 2152596464
02050           if (mParserContext->mDTD) {
02051             mParserContext->mDTD->WillInterruptParse(mSink);
02052           }
02053           
02054           BlockParser();
02055           return NS_OK;
02056         }
02057         
02058         else if (NS_ERROR_HTMLPARSER_STOPPARSING==result) {
02059           // Note: Parser Terminate() calls DidBuildModel.
              // DidBuildModel() is only called here if the parser was not
              // already in the stopped state.
02060           if(mInternalState!=NS_ERROR_HTMLPARSER_STOPPARSING) {
02061             DidBuildModel(mStreamStatus);
02062             mInternalState = result;
02063           }
02064           return NS_OK;
02065         }
02066                   
            // Either the buffer was fully consumed without error, or the model
            // builder was interrupted.
02067         else if(((NS_OK==result) && (theTokenizerResult==kEOF)) || (result==NS_ERROR_HTMLPARSER_INTERRUPTED)){
02068 
02069           PRBool theContextIsStringBased=PRBool(CParserContext::eCTString==mParserContext->mContextType);
02070           if( (eOnStop==mParserContext->mStreamListenerState) || 
02071               (!mParserContext->mMultipart) || theContextIsStringBased) {
02072 
                // Root context (no previous context): finish the document, but
                // only once the stream has reported eOnStop.
02073             if(!mParserContext->mPrevContext) {
02074               if(eOnStop==mParserContext->mStreamListenerState) {
02075 
02076                 DidBuildModel(mStreamStatus);          
02077 
02078                 MOZ_TIMER_DEBUGLOG(("Stop: Parse Time: nsParser::ResumeParse(), this=%p\n", this));
02079                 MOZ_TIMER_STOP(mParseTime);
02080 
02081                 MOZ_TIMER_LOG(("Parse Time (this=%p): ", this));
02082                 MOZ_TIMER_PRINT(mParseTime);
02083 
02084                 MOZ_TIMER_LOG(("DTD Time: "));
02085                 MOZ_TIMER_PRINT(mDTDTime);
02086 
02087                 MOZ_TIMER_LOG(("Tokenize Time: "));
02088                 MOZ_TIMER_PRINT(mTokenizeTime);
02089 
02090                 return NS_OK;
02091               }
02092 
02093             }
                // Nested context: pop and destroy it, keeping its unconsumed
                // data in mUnusedInput when the context asks for that.
02094             else { 
02095 
02096               CParserContext* theContext=PopContext();
02097               if(theContext) {
02098                 theIterationIsOk=PRBool(allowIteration && theContextIsStringBased);
02099                 if(theContext->mCopyUnused) {
02100                   theContext->mScanner->CopyUnusedData(mUnusedInput);
02101                 }
02102                 delete theContext;
02103               }
                  // From here on |result| carries the parser's internal state
                  // rather than the BuildModel() result of the popped context.
02104               result = mInternalState;  
02105               aIsFinalChunk=(mParserContext && mParserContext->mStreamListenerState==eOnStop)? PR_TRUE:PR_FALSE;
02106               
02107                 //...then intentionally fall through to WillInterruptParse()...
02108             }
02109 
02110           }             
02111 
02112         }
02113 
            // End of buffer or interruption: an interruption is reported as
            // NS_OK and the DTD is told that parsing is pausing.
02114         if((kEOF==theTokenizerResult) || (result==NS_ERROR_HTMLPARSER_INTERRUPTED)) {
02115           result = (result == NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
02116           if (mParserContext->mDTD) {
02117             mParserContext->mDTD->WillInterruptParse(mSink);
02118           }
02119         }
02120 
02121 
02122       }//while
02123     }//if
02124     else {
          // No DTD on the current context: record the error in the parser state.
02125       mInternalState=result=NS_ERROR_HTMLPARSER_UNRESOLVEDDTD;
02126     }
02127   }//if
02128 
02129   MOZ_TIMER_DEBUGLOG(("Stop: Parse Time: nsParser::ResumeParse(), this=%p\n", this));
02130   MOZ_TIMER_STOP(mParseTime);
02131 
02132   return (result==NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
02133 }
02134 
02143 nsresult nsParser::BuildModel() {
02144   CParserContext* theRootContext = mParserContext;
02145   nsITokenizer*   theTokenizer = 0;
02146 
02147   nsresult result = NS_OK;
02148   if (mParserContext) {
02149     PRInt32 type = mParserContext->mDTD ? mParserContext->mDTD->GetType() : NS_IPARSER_FLAG_HTML;
02150     mParserContext->GetTokenizer(type, mSink, theTokenizer);
02151   }
02152 
02153   if (theTokenizer) {
02154 
02155     //Get the root DTD for use in model building...
02156     while (theRootContext->mPrevContext) {
02157       theRootContext = theRootContext->mPrevContext;
02158     }
02159 
02160     nsIDTD* theRootDTD = theRootContext->mDTD;
02161     if (theRootDTD) {      
02162       MOZ_TIMER_START(mDTDTime);
02163       
02164       result = theRootDTD->BuildModel(this, theTokenizer, nsnull, mSink);  
02165       
02166       MOZ_TIMER_STOP(mDTDTime);
02167     }
02168   }
02169   else{
02170     mInternalState = result = NS_ERROR_HTMLPARSER_BADTOKENIZER;
02171   }
02172   return result;
02173 }
02174 
02175 
02182 nsresult nsParser::GetTokenizer(nsITokenizer*& aTokenizer) {
02183   nsresult result = NS_OK;
02184   aTokenizer = nsnull;
02185   if(mParserContext) {
02186     PRInt32 type = mParserContext->mDTD ? mParserContext->mDTD->GetType() : NS_IPARSER_FLAG_HTML;
02187     result = mParserContext->GetTokenizer(type, mSink, aTokenizer);
02188   }
02189   return result;
02190 }
02191 
02192 /*******************************************************************
02193   These methods are used to talk to the netlib system...
02194  *******************************************************************/
02195 
02196 #ifdef rickgdebug
02197 #include <fstream.h>
02198   fstream* gOutFile;
02199 #endif
02200 
02208 nsresult nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext) {
02209 
02210   NS_PRECONDITION(eNone==mParserContext->mStreamListenerState,
02211                   "Parser's nsIStreamListener API was not setup "
02212                   "correctly in constructor.");
02213 
02214   if (mObserver) {
02215     mObserver->OnStartRequest(request, aContext);
02216   }
02217   mParserContext->mStreamListenerState = eOnStart;
02218   mParserContext->mAutoDetectStatus = eUnknownDetect;
02219   mParserContext->mDTD = 0;
02220   mParserContext->mRequest = request;
02221 
02222   nsresult rv;
02223   nsCAutoString contentType;
02224   nsCOMPtr<nsIChannel> channel = do_QueryInterface(request);
02225   if (channel) {
02226     rv = channel->GetContentType(contentType);
02227     if (NS_SUCCEEDED(rv)) {
02228       mParserContext->SetMimeType(contentType);
02229     }
02230   }
02231 
02232 #ifdef rickgdebug
02233   gOutFile= new fstream("c:/temp/out.file",ios::trunc);
02234 #endif
02235 
02236   rv = NS_OK;
02237 
02238   if (sParserDataListeners && mSink) {
02239     nsISupports *ctx = mSink->GetTarget();
02240     PRInt32 count = sParserDataListeners->Count();
02241 
02242     while (count--) {
02243       rv |= sParserDataListeners->ObjectAt(count)->
02244         OnStartRequest(request, ctx);
02245     }
02246   }
02247 
02248   return rv;
02249 }
02250 
02251 
// Charset names assigned to the detected charset by DetectByteOrderMark().
// Note that the UCS4_BE / UCS4_LE macros expand to the "UTF-32BE" /
// "UTF-32LE" names, while the two unusual UCS-4 octet orders (2143 and 3412)
// use "X-ISO-10646-UCS-4-*" names.
02252 #define UTF16_BE "UTF-16BE"
02253 #define UTF16_LE "UTF-16LE"
02254 #define UCS4_BE "UTF-32BE"
02255 #define UCS4_LE "UTF-32LE"
02256 #define UCS4_2143 "X-ISO-10646-UCS-4-2143"
02257 #define UCS4_3412 "X-ISO-10646-UCS-4-3412"
02258 #define UTF8 "UTF-8"
02259 
02260 static inline PRBool IsSecondMarker(unsigned char aChar)
02261 {
02262   switch (aChar) {
02263     case '!':
02264     case '?':
02265     case 'h':
02266     case 'H':
02267       return PR_TRUE;
02268     default:
02269       return PR_FALSE;
02270   }
02271 }
02272 
02273 static PRBool DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsCString& oCharset, PRInt32& oCharsetSource) {
02274  oCharsetSource= kCharsetFromAutoDetection;
02275  oCharset.Truncate();
02276  // See http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
02277  // for details
02278  // Also, MS Win2K notepad now generate 3 bytes BOM in UTF8 as UTF8 signature
02279  // We need to check that
02280  // UCS2 BOM FEFF = UTF8 EF BB BF
02281  switch(aBytes[0])
02282         {
02283    case 0x00:
02284      if(0x00==aBytes[1]) {
02285         // 00 00
02286         if((0xFE==aBytes[2]) && (0xFF==aBytes[3])) {
02287            // 00 00 FE FF UCS-4, big-endian machine (1234 order)
02288            oCharset.Assign(UCS4_BE);
02289         } else if((0x00==aBytes[2]) && (0x3C==aBytes[3])) {
02290            // 00 00 00 3C UCS-4, big-endian machine (1234 order)
02291            oCharset.Assign(UCS4_BE);
02292         } else if((0xFF==aBytes[2]) && (0xFE==aBytes[3])) {
02293            // 00 00 FF FE UCS-4, unusual octet order (2143)
02294            oCharset.Assign(UCS4_2143);
02295         } else if((0x3C==aBytes[2]) && (0x00==aBytes[3])) {
02296            // 00 00 3C 00 UCS-4, unusual octet order (2143)
02297            oCharset.Assign(UCS4_2143);
02298         } 
02299         oCharsetSource = kCharsetFromByteOrderMark;
02300      } else if((0x3C==aBytes[1]) && (0x00==aBytes[2])) {
02301         // 00 3C 00
02302         if(IsSecondMarker(aBytes[3])) {
02303            // 00 3C 00 SM UTF-16,  big-endian, no Byte Order Mark 
02304            oCharset.Assign(UTF16_BE); 
02305         } else if((0x00==aBytes[3])) {
02306            // 00 3C 00 00 UCS-4, unusual octet order (3412)
02307            oCharset.Assign(UCS4_3412);
02308         } 
02309         oCharsetSource = kCharsetFromByteOrderMark;
02310      }
02311    break;
02312    case 0x3C:
02313      if(0x00==aBytes[1] && (0x00==aBytes[3])) {
02314         // 3C 00 XX 00
02315         if(IsSecondMarker(aBytes[2])) {
02316            // 3C 00 SM 00 UTF-16,  little-endian, no Byte Order Mark 
02317            oCharset.Assign(UTF16_LE); 
02318         } else if((0x00==aBytes[2])) {
02319            // 3C 00 00 00 UCS-4, little-endian machine (4321 order)
02320            oCharset.Assign(UCS4_LE); 
02321         } 
02322         oCharsetSource = kCharsetFromByteOrderMark;
02323      // For html, meta tag detector is invoked before this so that we have 
02324      // to deal only with XML here.
02325      } else if(                     (0x3F==aBytes[1]) &&
02326                (0x78==aBytes[2]) && (0x6D==aBytes[3]) &&
02327                (0 == PL_strncmp("<?xml", (char*)aBytes, 5 ))) {
02328        // 3C 3F 78 6D
02329        // ASCII characters are in their normal positions, so we can safely
02330        // deal with the XML declaration in the old C way
02331        // XXX This part could be made simpler by using CWordTokenizer<char>,
02332        //     but bug 104479 must be fixed first.
02333        // The shortest string so far (strlen==5):
02334        // <?xml
02335        PRInt32 i;
02336        PRBool versionFound = PR_FALSE, encodingFound = PR_FALSE;
02337        for (i=6; i < aLen && !encodingFound; ++i) {
02338          // end of XML declaration?
02339          if ((((char*)aBytes)[i] == '?') && 
02340            ((i+1) < aLen) &&
02341            (((char*)aBytes)[i+1] == '>')) {
02342            break;
02343          }
02344          // Version is required.
02345          if (!versionFound) {
02346            // Want to avoid string comparisons, hence looking for 'n'
02347            // and only if found check the string leading to it. Not
02348            // foolproof, but fast.
02349            // The shortest string allowed before this is  (strlen==13):
02350            // <?xml version
02351            if ((((char*)aBytes)[i] == 'n') &&
02352              (i >= 12) && 
02353              (0 == PL_strncmp("versio", (char*)(aBytes+i-6), 6 ))) {
02354              // Fast forward through version
02355              char q = 0;
02356              for (++i; i < aLen; ++i) {
02357                char qi = ((char*)aBytes)[i];
02358                if (qi == '\'' || qi == '"') {
02359                  if (q && q == qi) {
02360                    //  ending quote
02361                    versionFound = PR_TRUE;
02362                    break;
02363                  } else {
02364                    // Starting quote
02365                    q = qi;
02366                  }
02367                }
02368              }
02369            }
02370          } else {
02371            // encoding must follow version
02372            // Want to avoid string comparisons, hence looking for 'g'
02373            // and only if found check the string leading to it. Not
02374            // foolproof, but fast.
02375            // The shortest allowed string before this (strlen==26):
02376            // <?xml version="1" encoding
02377            if ((((char*)aBytes)[i] == 'g') &&
02378              (i >= 25) && 
02379              (0 == PL_strncmp("encodin", (char*)(aBytes+i-7), 7 ))) {
02380              PRInt32 encStart = 0;
02381              char q = 0;
02382              for (++i; i < aLen; ++i) {
02383                char qi = ((char*)aBytes)[i];
02384                if (qi == '\'' || qi == '"') {
02385                  if (q && q == qi) {
02386                    PRInt32 count = i - encStart;
02387                    // encoding value is invalid if it is UTF-16
02388                    if (count > 0 && 
02389                      (0 != PL_strcmp("UTF-16", (char*)(aBytes+encStart)))) {
02390                      oCharset.Assign((char*)(aBytes+encStart),count);
02391                      oCharsetSource = kCharsetFromMetaTag;
02392                    }
02393                    encodingFound = PR_TRUE;
02394                    break;
02395                  } else {
02396                    encStart = i+1;
02397                    q = qi;
02398                  }
02399                }
02400              }
02401            }
02402          } // if (!versionFound)
02403        } // for
02404      }
02405    break;
02406    case 0xEF:  
02407      if((0xBB==aBytes[1]) && (0xBF==aBytes[2])) {
02408         // EF BB BF
02409         // Win2K UTF-8 BOM
02410         oCharset.Assign(UTF8); 
02411         oCharsetSource= kCharsetFromByteOrderMark;
02412      }
02413    break;
02414    case 0xFE:
02415      if(0xFF==aBytes[1]) {
02416         if(0x00==aBytes[2] && 0x00==aBytes[3]) {
02417           // FE FF 00 00  UCS-4, unusual octet order (3412)
02418           oCharset.Assign(UCS4_3412);
02419         } else {
02420           // FE FF UTF-16, big-endian 
02421           oCharset.Assign(UTF16_BE); 
02422         }
02423         oCharsetSource= kCharsetFromByteOrderMark;
02424      }
02425    break;
02426    case 0xFF:
02427      if(0xFE==aBytes[1]) {
02428         if(0x00==aBytes[2] && 0x00==aBytes[3]) 
02429          // FF FE 00 00  UTF-32, little-endian
02430            oCharset.Assign(UCS4_LE); 
02431         else
02432         // FF FE
02433         // UTF-16, little-endian 
02434            oCharset.Assign(UTF16_LE); 
02435         oCharsetSource= kCharsetFromByteOrderMark;
02436      }
02437    break;
02438    // case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) {
02439    //   We do not care EBCIDIC here....
02440    // }
02441    // break;
02442  }  // switch
02443  return !oCharset.IsEmpty();
02444 }
02445 
02446 inline const char GetNextChar(nsACString::const_iterator& aStart,
02447                               nsACString::const_iterator& aEnd)
02448 {
02449   NS_ASSERTION(aStart != aEnd, "end of buffer");
02450   return (++aStart != aEnd) ? *aStart : '\0';
02451 }
02452 
// Looks for a charset declared in a META tag near the start of an HTML
// document, using a fast and loose scan rather than real parsing.
//
// aBytes / aLen  - raw document data; at most the first 2048 bytes are
//                  examined.
// aCharset       - emptied on entry; receives the charset value when one is
//                  found.
// aCharsetSource - always set to kCharsetFromMetaTag.
//
// Returns PR_TRUE when a non-empty charset value was found. Returns PR_FALSE
// when the current context's mime type is not kHTMLTextContentType, when a
// comment is not terminated inside the examined range, or when no complete
// META tag with a charset value was found.
02453 PRBool 
02454 nsParser::DetectMetaTag(const char* aBytes, 
02455                         PRInt32 aLen, 
02456                         nsCString& aCharset, 
02457                         PRInt32& aCharsetSource) 
02458 {
02459   aCharsetSource= kCharsetFromMetaTag;
02460   aCharset.SetLength(0);
02461 
02462   // XXX Only look inside HTML documents for now. For XML
02463   // documents we should be looking inside the XMLDecl.
02464   if (!mParserContext->mMimeType.EqualsLiteral(kHTMLTextContentType)) {
02465     return PR_FALSE;
02466   }
02467 
02468   // Fast and loose parsing to determine if we have a complete
02469   // META tag in this block, looking upto 2k into it.
02470   const nsASingleFragmentCString& str =
02471       Substring(aBytes, aBytes + PR_MIN(aLen, 2048));
02472   // XXXldb Should be const_char_iterator when FindInReadable supports it.
02473   nsACString::const_iterator begin, end;
02474   
02475   str.BeginReading(begin);
02476   str.EndReading(end);
      // currPos: scan position; tokEnd: end of the token being examined;
      // tagEnd: position of the '>' closing the current tag.
02477   nsACString::const_iterator currPos(begin);
02478   nsACString::const_iterator tokEnd;
02479   nsACString::const_iterator tagEnd(begin);
02480   
02481   while (currPos != end) {
02482     if (!FindCharInReadable('<', currPos, end)) 
02483       break; // no tag found in this buffer
02484 
        // Each GetNextChar() call advances currPos, and the && chain stops
        // advancing at the first character that does not match.
02485     if (GetNextChar(currPos, end) == '!' && 
02486         GetNextChar(currPos, end) == '-' &&
02487         GetNextChar(currPos, end) == '-') {
02488       // Found MDO ( <!-- ). Now search for MDC ( --[*s]> )
02489       PRBool foundMDC = PR_FALSE;
02490       PRBool foundMatch = PR_FALSE; 
02491       while (!foundMDC) {
02492         if (GetNextChar(currPos, end) == '-' && 
02493             GetNextChar(currPos, end) == '-') {
02494           foundMatch = !foundMatch; // toggle until we've matching "--"
02495         }
02496         else if (currPos == end) {
02497           return PR_FALSE; // Couldn't find --[*s]> in this buffer
02498         }
02499         else if (foundMatch && *currPos == '>') {
02500           foundMDC = PR_TRUE; // found comment end delimiter.
02501           ++currPos;
02502         }
02503       }
02504       continue; // continue searching for META tag.
02505     }
02506 
02507     // Find the end of the tag, break if incomplete
02508     tagEnd = currPos;
02509     if (!FindCharInReadable('>', tagEnd, end))
02510       break;
02511 
02512     // If this is not a META tag, continue to next loop
        // (currPos is only advanced past a letter that matched, so it never
        // moves beyond the '>' at tagEnd here.)
02513     if ( (*currPos != 'm' && *currPos != 'M') ||
02514          (*(++currPos) != 'e' && *currPos != 'E') ||
02515          (*(++currPos) != 't' && *currPos != 'T') ||
02516          (*(++currPos) != 'a' && *currPos != 'A') ) {
02517       currPos = tagEnd;
02518       continue;
02519     }
02520 
02521     // If could not find "charset" in this tag, skip this tag and try next
02522     tokEnd = tagEnd;
02523     if (!CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("CHARSET"), currPos, tokEnd)) {
02524       currPos = tagEnd;
02525       continue;
02526     }
        // Continue right after the matched "charset" text.
02527     currPos = tokEnd;
02528 
02529     // skip spaces before '='
02530     while (*currPos == kSpace || *currPos == kNewLine ||
02531            *currPos == kCR || *currPos == kTab)  
02532       ++currPos;
02533     // skip '='
02534     if (*currPos != '=') {
02535       currPos = tagEnd;
02536       continue;
02537     }
02538     ++currPos;
02539     // skip spaces after '='
02540     while (*currPos == kSpace || *currPos == kNewLine ||
02541            *currPos == kCR || *currPos == kTab)  
02542       ++currPos;
02543           
02544     // skip open quote
02545     if ((*currPos == '\'' || *currPos == '\"'))
02546       ++currPos;
02547 
02548     // find the end of charset string
        // The value runs up to the next quote character or the end of the tag,
        // whichever comes first.
02549     tokEnd = currPos;
02550     while (*tokEnd != '\'' && *tokEnd != '\"' && tokEnd != tagEnd) 
02551       ++tokEnd;
02552 
02553     // return true if we successfully got something for charset
02554     if (currPos != tokEnd) {
02555       aCharset.Assign(currPos.get(), tokEnd.get() - currPos.get());
02556       return PR_TRUE;
02557     } 
02558     
02559     //nothing specified as charset, continue next loop
02560     currPos = tagEnd;
02561   } 
02562   
02563   return PR_FALSE;
02564 }
02565 
02566 typedef struct {
02567   PRBool mNeedCharsetCheck;
02568   nsParser* mParser;
02569   nsIParserFilter* mParserFilter;
02570   nsScanner* mScanner;
02571   nsIRequest* mRequest;
02572 } ParserWriteStruct;
02573 
02574 /*
02575  * This function is invoked as a result of a call to a stream's
02576  * ReadSegments() method. It is called for each contiguous buffer
02577  * of data in the underlying stream or pipe. Using ReadSegments
02578  * allows us to avoid copying data to read out of the stream.
02579  */
02580 static NS_METHOD
02581 ParserWriteFunc(nsIInputStream* in,
02582                 void* closure,
02583                 const char* fromRawSegment,
02584                 PRUint32 toOffset,
02585                 PRUint32 count,
02586                 PRUint32 *writeCount)
02587 {
02588   nsresult result;
02589   ParserWriteStruct* pws = NS_STATIC_CAST(ParserWriteStruct*, closure);
02590   const char* buf = fromRawSegment;
02591   PRUint32 theNumRead = count;
02592 
02593   if (!pws) {
02594     return NS_ERROR_FAILURE;
02595   }
02596 
02597   if (pws->mNeedCharsetCheck) {
02598     PRInt32 guessSource;
02599     nsCAutoString guess;
02600     nsCAutoString preferred;
02601 
02602     pws->mNeedCharsetCheck = PR_FALSE;
02603     if (pws->mParser->DetectMetaTag(buf, theNumRead, guess, guessSource) ||
02604         ((count >= 4) &&
02605          DetectByteOrderMark((const unsigned char*)buf,
02606                              theNumRead, guess, guessSource))) {
02607       nsCOMPtr<nsICharsetAlias> alias(do_GetService(NS_CHARSETALIAS_CONTRACTID));
02608       result = alias->GetPreferred(guess, preferred);
02609       // Only continue if it's a recognized charset and not
02610       // one of a designated set that we ignore.
02611       if (NS_SUCCEEDED(result) &&
02612           ((kCharsetFromByteOrderMark == guessSource) ||
02613            (!preferred.EqualsLiteral("UTF-16") &&
02614             !preferred.EqualsLiteral("UTF-16BE") &&
02615             !preferred.EqualsLiteral("UTF-16LE") &&
02616             !preferred.EqualsLiteral("UTF-32BE") &&
02617             !preferred.EqualsLiteral("UTF-32LE")))) {
02618         guess = preferred;
02619         pws->mParser->SetDocumentCharset(guess, guessSource);
02620         pws->mParser->SetSinkCharset(preferred);
02621         nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
02622         if (channel) {
02623           nsCOMPtr<nsISupports> cacheToken;
02624           channel->GetCacheToken(getter_AddRefs(cacheToken));
02625           if (cacheToken) {
02626             nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
02627             if (cacheDescriptor) {
02628 #ifdef DEBUG
02629               nsresult rv =
02630 #endif
02631                 cacheDescriptor->SetMetaDataElement("charset",
02632                                                     guess.get());
02633               NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
02634             }
02635           }
02636         }
02637       }
02638     }
02639   }
02640 
02641   if (pws->mParserFilter)
02642     pws->mParserFilter->RawBuffer(buf, &theNumRead);
02643 
02644   result = pws->mScanner->Append(buf, theNumRead, pws->mRequest);
02645   if (NS_SUCCEEDED(result)) {
02646     *writeCount = count;
02647   }
02648 
02649   return result;
02650 }
02651 
02661 nsresult nsParser::OnDataAvailable(nsIRequest *request, nsISupports* aContext,
02662                                    nsIInputStream *pIStream,
02663                                    PRUint32 sourceOffset, PRUint32 aLength)
02664 {
02665   NS_PRECONDITION((eOnStart == mParserContext->mStreamListenerState ||
02666                    eOnDataAvail == mParserContext->mStreamListenerState),
02667             "Error: OnStartRequest() must be called before OnDataAvailable()");
02668   NS_PRECONDITION(NS_InputStreamIsBuffered(pIStream),
02669                   "Must have a buffered input stream");
02670 
02671   nsresult rv = NS_OK;
02672 
02673   CParserContext *theContext=mParserContext;
02674 
02675   while (theContext) {
02676     if (theContext->mRequest != request && theContext->mPrevContext)
02677       theContext = theContext->mPrevContext;
02678     else break;
02679   }
02680 
02681   if (theContext && theContext->mRequest == request) {
02682 
02683     theContext->mStreamListenerState = eOnDataAvail;
02684 
02685     if (eInvalidDetect == theContext->mAutoDetectStatus) {
02686       if (theContext->mScanner) {
02687         nsScannerIterator iter;
02688         theContext->mScanner->EndReading(iter);
02689         theContext->mScanner->SetPosition(iter, PR_TRUE);
02690       }
02691     }
02692 
02693     PRUint32 totalRead;
02694     ParserWriteStruct pws;
02695     pws.mNeedCharsetCheck =
02696       (0 == sourceOffset) && (mCharsetSource < kCharsetFromMetaTag);
02697     pws.mParser = this;
02698     pws.mParserFilter = mParserFilter;
02699     pws.mScanner = theContext->mScanner;
02700     pws.mRequest = request;
02701 
02702     rv = pIStream->ReadSegments(ParserWriteFunc, &pws, aLength, &totalRead);
02703     if (NS_FAILED(rv)) {
02704       return rv;
02705     }
02706 
02707     // Don't bother to start parsing until we've seen some
02708     // non-whitespace data
02709     if (theContext->mScanner->FirstNonWhitespacePosition() >= 0) {
02710       rv = ResumeParse();
02711     }
02712   }
02713   else {
02714     rv = NS_ERROR_UNEXPECTED;
02715   }
02716 
02717   return rv;
02718 }
02719 
02728 nsresult nsParser::OnStopRequest(nsIRequest *request, nsISupports* aContext,
02729                                  nsresult status)
02730 {  
02731 
02732   nsresult rv = NS_OK;
02733 
02734   if (eOnStart == mParserContext->mStreamListenerState) {
02735     //If you're here, then OnDataAvailable() never got called.  Prior
02736     //to necko, we never dealt with this case, but the problem may
02737     //have existed.  Everybody can live with an empty input stream, so
02738     //just resume parsing.
02739     rv = ResumeParse(PR_TRUE, PR_TRUE);    
02740   }
02741 
02742   CParserContext *pc = mParserContext;
02743   while (pc) {
02744     if (pc->mRequest == request) {
02745       pc->mStreamListenerState = eOnStop;
02746       pc->mScanner->SetIncremental(PR_FALSE);
02747       break;
02748     }
02749 
02750     pc = pc->mPrevContext;
02751   }
02752 
02753   mStreamStatus = status;
02754 
02755   if (mParserFilter)
02756     mParserFilter->Finish();
02757 
02758   if (NS_SUCCEEDED(rv)) {
02759     rv = ResumeParse(PR_TRUE, PR_TRUE);
02760   }
02761 
02762   // If the parser isn't enabled, we don't finish parsing till
02763   // it is reenabled.
02764 
02765 
02766   // XXX Should we wait to notify our observers as well if the
02767   // parser isn't yet enabled?
02768   if (mObserver) {
02769     mObserver->OnStopRequest(request, aContext, status);
02770   }
02771 
02772 #ifdef rickgdebug
02773   if(gOutFile){
02774     gOutFile->close();
02775     delete gOutFile;
02776     gOutFile = 0;
02777   }
02778 #endif
02779 
02780   if (sParserDataListeners && mSink) {
02781     nsISupports *ctx = mSink->GetTarget();
02782     PRInt32 count = sParserDataListeners->Count();
02783 
02784     while (count--) {
02785       rv |= sParserDataListeners->ObjectAt(count)->OnStopRequest(request, ctx,
02786                                                                  status);
02787     }
02788   }
02789 
02790   return rv;
02791 }
02792 
02793 
02794 /*******************************************************************
02795   Here come the tokenization methods...
02796  *******************************************************************/
02797 
02798 
02808 PRBool nsParser::WillTokenize(PRBool aIsFinalChunk){
02809   nsITokenizer* theTokenizer=0;
02810   nsresult result = NS_OK;
02811   if (mParserContext) {
02812     PRInt32 type = mParserContext->mDTD ? mParserContext->mDTD->GetType() : NS_IPARSER_FLAG_HTML;
02813     mParserContext->GetTokenizer(type, mSink, theTokenizer);
02814   }
02815 
02816   if (theTokenizer) {
02817     result = theTokenizer->WillTokenize(aIsFinalChunk,&mTokenAllocator);
02818   }  
02819   return result;
02820 }
02821 
02822 
02831 nsresult nsParser::Tokenize(PRBool aIsFinalChunk){
02832   
02833   nsITokenizer* theTokenizer = 0;
02834     
02835   nsresult result = NS_OK;
02836 
02837   if (mParserContext) {
02838     PRInt32 type = mParserContext->mDTD ? mParserContext->mDTD->GetType() : NS_IPARSER_FLAG_HTML;
02839     mParserContext->GetTokenizer(type, mSink, theTokenizer);
02840   }
02841 
02842   if (theTokenizer) { 
02843     if (mFlags & NS_PARSER_FLAG_FLUSH_TOKENS) {
02844       // For some reason tokens didn't get flushed ( probably
02845       // the parser got blocked before all the tokens in the
02846       // stack got handled ). Flush 'em now. Ref. bug 104856
02847       if (theTokenizer->GetCount() == 0) {
02848         mFlags &= ~NS_PARSER_FLAG_FLUSH_TOKENS; // reset since the tokens have been flushed.
02849         // Resume tokenization for the rest of the document 
02850         // since all the tokens in the tokenizer got flushed.
02851         result = Tokenize(aIsFinalChunk); 
02852       }
02853     }
02854     else {
02855       PRBool flushTokens=PR_FALSE;
02856 
02857       MOZ_TIMER_START(mTokenizeTime);
02858 
02859       WillTokenize(aIsFinalChunk);
02860       while (NS_SUCCEEDED(result)) {
02861         mParserContext->mScanner->Mark();
02862         result=theTokenizer->ConsumeToken(*mParserContext->mScanner, flushTokens);
02863         if (NS_FAILED(result)) {
02864           mParserContext->mScanner->RewindToMark();
02865           if (kEOF == result){
02866             break;
02867           }
02868           else if(NS_ERROR_HTMLPARSER_STOPPARSING==result) {
02869             result = Terminate();
02870             break;
02871           }
02872         }
02873         else if (flushTokens && (mFlags & NS_PARSER_FLAG_OBSERVERS_ENABLED)) {
02874           // I added the extra test of NS_PARSER_FLAG_OBSERVERS_ENABLED to fix Bug# 23931.
02875           // Flush tokens on seeing </SCRIPT> -- Ref: Bug# 22485 --
02876           // Also remember to update the marked position.
02877           mFlags |= NS_PARSER_FLAG_FLUSH_TOKENS;
02878           mParserContext->mScanner->Mark();
02879           break;
02880         }
02881       } 
02882       DidTokenize(aIsFinalChunk);
02883 
02884       MOZ_TIMER_STOP(mTokenizeTime);
02885     }  
02886   }
02887   else{
02888     result = mInternalState = NS_ERROR_HTMLPARSER_BADTOKENIZER;
02889   }
02890   
02891   return result;
02892 }
02893 
02903 PRBool nsParser::DidTokenize(PRBool aIsFinalChunk){
02904   PRBool result=PR_TRUE;
02905 
02906   nsITokenizer* theTokenizer=0;
02907   nsresult rv = NS_OK;
02908   if (mParserContext) {
02909     PRInt32 type = mParserContext->mDTD ? mParserContext->mDTD->GetType() : NS_IPARSER_FLAG_HTML;
02910     mParserContext->GetTokenizer(type, mSink, theTokenizer);
02911   }
02912 
02913   if (NS_SUCCEEDED(rv) && theTokenizer) {
02914     result = theTokenizer->DidTokenize(aIsFinalChunk);
02915   }
02916   return result;
02917 }
02918 
02925 NS_IMETHODIMP 
02926 nsParser::GetChannel(nsIChannel** aChannel)
02927 {
02928   nsresult result = NS_ERROR_NOT_AVAILABLE;
02929   if (mParserContext && mParserContext->mRequest)
02930     result = CallQueryInterface(mParserContext->mRequest, aChannel);
02931   return result;
02932 }
02933 
02940 NS_IMETHODIMP 
02941 nsParser::GetDTD(nsIDTD** aDTD)
02942 {
02943   if (mParserContext) {
02944     *aDTD = mParserContext->mDTD;
02945     NS_IF_ADDREF(mParserContext->mDTD);
02946   }
02947   
02948   return NS_OK;
02949 }
02950