Back to index

lightning-sunbird  0.9+nobinonly
nsWebCrawler.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
00002  *
00003  * ***** BEGIN LICENSE BLOCK *****
00004  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00005  *
00006  * The contents of this file are subject to the Mozilla Public License Version
00007  * 1.1 (the "License"); you may not use this file except in compliance with
00008  * the License. You may obtain a copy of the License at
00009  * http://www.mozilla.org/MPL/
00010  *
00011  * Software distributed under the License is distributed on an "AS IS" basis,
00012  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00013  * for the specific language governing rights and limitations under the
00014  * License.
00015  *
00016  * The Original Code is Mozilla Communicator client code.
00017  *
00018  * The Initial Developer of the Original Code is
00019  * Netscape Communications Corporation.
00020  * Portions created by the Initial Developer are Copyright (C) 1998
00021  * the Initial Developer. All Rights Reserved.
00022  *
00023  * Contributor(s):
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either of the GNU General Public License Version 2 or later (the "GPL"),
00027  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK *****
00038  * This Original Code has been modified by IBM Corporation. Modifications made by IBM 
00039  * described herein are Copyright (c) International Business Machines Corporation, 2000.
00040  * Modifications to Mozilla code or documentation identified per MPL Section 3.3
00041  *
00042  * Date             Modified by     Description of modification
00043  * 04/20/2000       IBM Corp.      OS/2 VisualAge build.
00044  */
00045 #include "nscore.h"
00046 #include "nsCOMPtr.h"
00047 #include "nsWebCrawler.h"
00048 #include "nsViewerApp.h"
00049 #include "nsIContentViewer.h"
00050 #include "nsIDocumentViewer.h"
00051 #include "nsIDocument.h"
00052 #include "nsIContent.h"
00053 #include "nsIPresShell.h"
00054 #include "nsPresContext.h"
00055 #include "nsIViewManager.h"
00056 #include "nsIFrame.h"
00057 #include "nsIFrameDebug.h"
00058 #include "nsIURL.h"
00059 #include "nsNetUtil.h"
00060 #include "nsITimer.h"
00061 #include "nsIAtom.h"
00062 #include "nsIFrameUtil.h"
00063 #include "nsIComponentManager.h"
00064 #include "nsLayoutCID.h"
00065 #include "nsRect.h"
00066 #include "plhash.h"
00067 #include "nsINameSpaceManager.h"
00068 #include "nsXPIDLString.h"
00069 #include "nsReadableUtils.h"
00070 #include "nsIServiceManager.h"
00071 #include "nsIEventQueueService.h"
00072 #include "nsIEventQueue.h"
00073 #include "prprf.h"
00074 #include "nsIContentViewer.h"
00075 #include "nsIContentViewerFile.h"
00076 #include "nsIDocShell.h"
00077 #include "nsIWebNavigation.h"
00078 #include "nsIWebProgress.h"
00079 
// Class ID used to instantiate the frame-utility component that compares
// regression data files (see PerformRegressionTest).
static NS_DEFINE_IID(kFrameUtilCID, NS_FRAME_UTIL_CID);
00081 
00082 static PLHashNumber
00083 HashKey(nsIAtom* key)
00084 {
00085   return NS_PTR_TO_INT32(key);
00086 }
00087 
00088 static PRIntn
00089 CompareKeys(nsIAtom* key1, nsIAtom* key2)
00090 {
00091   return key1 == key2;
00092 }
00093 
// Thin wrapper around an NSPR PLHashTable keyed on nsIAtom pointers.
// Keys are AddRef'd when first stored (see Put) and Released when the
// table is destroyed; values are opaque pointers the table does NOT own.
class AtomHashTable {
public:
  AtomHashTable();
  ~AtomHashTable();

  // Return the value stored under aKey, or nsnull if absent.
  const void* Get(nsIAtom* aKey);
  // Store aValue under aKey; returns the previous value (or nsnull).
  const void* Put(nsIAtom* aKey, const void* aValue);
  // Remove aKey's entry; returns the value that was stored (or nsnull).
  const void* Remove(nsIAtom* aKey);

protected:
  PLHashTable* mTable;  // underlying NSPR hash table
};
00106 
AtomHashTable::AtomHashTable()
{
  // 8 initial buckets; no value comparator or custom allocator is needed
  // since values are opaque, unowned pointers.
  mTable = PL_NewHashTable(8, (PLHashFunction) HashKey,
                           (PLHashComparator) CompareKeys,
                           (PLHashComparator) nsnull,
                           nsnull, nsnull);
}
00114 
00115 static PRIntn PR_CALLBACK
00116 DestroyEntry(PLHashEntry *he, PRIntn i, void *arg)
00117 {
00118   ((nsIAtom*)he->key)->Release();
00119   return HT_ENUMERATE_NEXT;
00120 }
00121 
AtomHashTable::~AtomHashTable()
{
  // Drop the reference each key acquired in Put(), then free the table.
  PL_HashTableEnumerateEntries(mTable, DestroyEntry, 0);
  PL_HashTableDestroy(mTable);
}
00127 
00131 const void*
00132 AtomHashTable::Get(nsIAtom* aKey)
00133 {
00134   PRInt32 hashCode = NS_PTR_TO_INT32(aKey);
00135   PLHashEntry** hep = PL_HashTableRawLookup(mTable, hashCode, aKey);
00136   PLHashEntry* he = *hep;
00137   if (nsnull != he) {
00138     return he->value;
00139   }
00140   return nsnull;
00141 }
00142 
00148 const void*
00149 AtomHashTable::Put(nsIAtom* aKey, const void* aData)
00150 {
00151   PRInt32 hashCode = NS_PTR_TO_INT32(aKey);
00152   PLHashEntry** hep = PL_HashTableRawLookup(mTable, hashCode, aKey);
00153   PLHashEntry* he = *hep;
00154   if (nsnull != he) {
00155     const void* oldValue = he->value;
00156     he->value = NS_CONST_CAST(void*, aData);
00157     return oldValue;
00158   }
00159   NS_ADDREF(aKey);
00160   PL_HashTableRawAdd(mTable, hep, hashCode, aKey, NS_CONST_CAST(void*, aData));
00161   return nsnull;
00162 }
00163 
00168 const void*
00169 AtomHashTable::Remove(nsIAtom* aKey)
00170 {
00171   PRInt32 hashCode = NS_PTR_TO_INT32(aKey);
00172   PLHashEntry** hep = PL_HashTableRawLookup(mTable, hashCode, aKey);
00173   PLHashEntry* he = *hep;
00174   void* oldValue = nsnull;
00175   if (nsnull != he) {
00176     oldValue = he->value;
00177     PL_HashTableRawRemove(mTable, hep, he);
00178   }
00179   return oldValue;
00180 }
00181 
00182 //----------------------------------------------------------------------
00183 
nsWebCrawler::nsWebCrawler(nsViewerApp* aViewer)
  : mHaveURLList(PR_FALSE),
    mQueuedLoadURLs(0)
{
  mBrowser = nsnull;
  mViewer = aViewer;
  mCrawl = PR_FALSE;
  mJiggleLayout = PR_FALSE;
  mPostExit = PR_FALSE;
  mDelay = 200 /*msec*/; // arbitrary default; raised to 5000 for printer tests
  mMaxPages = -1;        // -1 means no page limit (see LoadNextURL)
  mRecord = nsnull;      // record file; opened elsewhere, closed in LoadNextURL
  // Atoms for the tags/attributes scanned when crawling for more URLs.
  mLinkTag = do_GetAtom("a");
  mFrameTag = do_GetAtom("frame");
  mIFrameTag = do_GetAtom("iframe");
  mHrefAttr = do_GetAtom("href");
  mSrcAttr = do_GetAtom("src");
  mBaseHrefAttr = do_GetAtom("_base_href");
  mVisited = new AtomHashTable();  // URLs already seen, keyed by atom
  mVerbose = nsnull;               // treated as a boolean flag below
  LL_I2L(mStartLoad, 0);
  mRegressing = PR_FALSE;
  mPrinterTestType = 0;            // 0 = no printer regression testing
  mRegressionOutputLevel = 0;     // full output
  mIncludeStyleInfo = PR_TRUE;
}
00210 
00211 static void FreeStrings(nsVoidArray& aArray)
00212 {
00213   PRInt32 i, n = aArray.Count();
00214   for (i = 0; i < n; i++) {
00215     nsString* s = (nsString*) aArray.ElementAt(i);
00216     delete s;
00217   }
00218   aArray.Clear();
00219 }
00220 
nsWebCrawler::~nsWebCrawler()
{
  // Release the heap-allocated nsStrings owned by the domain lists.
  FreeStrings(mSafeDomains);
  FreeStrings(mAvoidDomains);
  NS_IF_RELEASE(mBrowser);
  delete mVisited;
}
00228 
// XPCOM boilerplate: nsWebCrawler implements nsIWebProgressListener and
// supports weak references (it registers itself via AddProgressListener).
NS_IMPL_ISUPPORTS2(nsWebCrawler, 
                   nsIWebProgressListener,
                   nsISupportsWeakReference)
00232 
00233 void
00234 nsWebCrawler::DumpRegressionData()
00235 {
00236 #ifdef NS_DEBUG
00237   nsCOMPtr<nsIDocShell> docshell;
00238   mBrowser->GetDocShell(*getter_AddRefs(docshell));
00239   if (! docshell)
00240     return;
00241 
00242   if (mOutputDir.Length() > 0) {
00243     nsCOMPtr<nsIPresShell> shell = GetPresShell(docshell);
00244     if (!shell) return;
00245     if ( mPrinterTestType > 0 ) {
00246       nsCOMPtr <nsIContentViewer> viewer;
00247       docshell->GetContentViewer(getter_AddRefs(viewer));
00248 
00249       if (viewer){
00250         nsCOMPtr<nsIContentViewerFile> viewerFile = do_QueryInterface(viewer);
00251         if (viewerFile) {
00252           nsAutoString regressionFileName;
00253           FILE *fp = GetOutputFile(mLastURL, regressionFileName);
00254 
00255           switch (mPrinterTestType) {
00256           case 1:
00257             // dump print data to a file for regression testing
00258             viewerFile->Print(PR_TRUE, fp, nsnull);
00259             break;
00260           case 2:
00261             // visual printing tests, all go to the printer, no printer dialog
00262             viewerFile->Print(PR_TRUE, nsnull, nsnull);
00263             break;
00264           case 3:
00265             // visual printing tests, all go to the printer, with a printer dialog
00266             viewerFile->Print(PR_FALSE, nsnull, nsnull);
00267             break;
00268           default:
00269             break;
00270           }
00271           fclose(fp);
00272           if( mPrinterTestType == 1) {
00273             if (mRegressing) {
00274               PerformRegressionTest(regressionFileName);
00275             }
00276             else {
00277               fputs(NS_LossyConvertUCS2toASCII(regressionFileName).get(),
00278                     stdout);
00279               printf(" - being written\n");
00280             }
00281           }
00282         }
00283       }
00284     } 
00285     else {
00286       nsIFrame* root = shell->GetRootFrame();
00287       if (nsnull != root) {
00288         nsPresContext *presContext = shell->GetPresContext();
00289         
00290         if (mOutputDir.Length() > 0) {
00291           nsAutoString regressionFileName;
00292           FILE *fp = GetOutputFile(mLastURL, regressionFileName);
00293           if (fp) {
00294             nsIFrameDebug* fdbg;
00295             if (NS_SUCCEEDED(root->QueryInterface(NS_GET_IID(nsIFrameDebug), (void**) &fdbg))) {
00296               fdbg->DumpRegressionData(presContext, fp, 0, mIncludeStyleInfo);
00297             }
00298             fclose(fp);
00299             if (mRegressing) {
00300               PerformRegressionTest(regressionFileName);
00301             }
00302             else {
00303               fputs(NS_LossyConvertUCS2toASCII(regressionFileName).get(),
00304                     stdout);
00305               printf(" - being written\n");
00306             }
00307           }
00308           else {
00309             nsCAutoString file;
00310             (void)mLastURL->GetPath(file);
00311             printf("could not open output file for %s\n", file.get());
00312           }
00313         }
00314         else {
00315           nsIFrameDebug* fdbg;
00316           if (NS_SUCCEEDED(root->QueryInterface(NS_GET_IID(nsIFrameDebug), (void**) &fdbg))) {
00317             fdbg->DumpRegressionData(presContext, stdout, 0, mIncludeStyleInfo);
00318           }
00319         }
00320       }
00321     }
00322   }
00323 #endif
00324 }
00325 
00326 void
00327 nsWebCrawler::LoadNextURLCallback(nsITimer *aTimer, void *aClosure)
00328 {
00329   nsWebCrawler* self = (nsWebCrawler*) aClosure;
00330 
00331   // if we are doing printing regression tests, check to see 
00332   // if we can print (a previous job is not printing)
00333   if (self->mPrinterTestType > 0) {
00334     nsCOMPtr<nsIDocShell> docShell;
00335     self->mBrowser->GetDocShell(*getter_AddRefs(docShell));
00336     if (docShell){
00337       nsCOMPtr <nsIContentViewer> viewer;
00338       docShell->GetContentViewer(getter_AddRefs(viewer));
00339       if (viewer){
00340         nsCOMPtr<nsIContentViewerFile> viewerFile = do_QueryInterface(viewer);
00341         if (viewerFile) {
00342           PRBool printable;
00343           viewerFile->GetPrintable(&printable);
00344           if (PR_TRUE !=printable){
00345             self->mTimer = do_CreateInstance("@mozilla.org/timer;1");
00346             self->mTimer->InitWithFuncCallback(LoadNextURLCallback, self, self->mDelay,
00347                                                nsITimer::TYPE_ONE_SHOT);
00348             return;
00349           }
00350         }
00351       }
00352     }
00353   }
00354 
00355   self->DumpRegressionData();
00356   self->LoadNextURL(PR_FALSE);
00357 }
00358 
00359 void
00360 nsWebCrawler::QueueExitCallback(nsITimer *aTimer, void *aClosure)
00361 {
00362   nsWebCrawler* self = (nsWebCrawler*) aClosure;
00363   self->DumpRegressionData();
00364   self->QueueExit();
00365 }
00366 
00367 // nsIWebProgressListener implementation 
00368 NS_IMETHODIMP
00369 nsWebCrawler::OnStateChange(nsIWebProgress* aWebProgress, 
00370                             nsIRequest* aRequest, 
00371                             PRUint32 progressStateFlags, 
00372                             nsresult aStatus)
00373 {
00374   // Make sure that we're being notified for _our_ shell, and not some
00375   // subshell that's been created e.g. for an IFRAME.
00376   nsCOMPtr<nsIDocShell> docShell;
00377   mBrowser->GetDocShell(*getter_AddRefs(docShell));
00378   if (docShell) {
00379     nsCOMPtr<nsIWebProgress> progress = do_GetInterface(docShell);
00380     if (aWebProgress != progress)
00381       return NS_OK;
00382   }
00383 
00384   // Make sure that we're being notified for the whole document, not a
00385   // sub-load.
00386   if (! (progressStateFlags & nsIWebProgressListener::STATE_IS_DOCUMENT))
00387     return NS_OK;
00388 
00389   if (progressStateFlags & nsIWebProgressListener::STATE_START) {
00390     // If the document load is starting, remember its URL as the last
00391     // URL we've loaded.
00392     nsCOMPtr<nsIChannel> channel(do_QueryInterface(aRequest));
00393     if (! channel) {
00394       NS_ERROR("no channel avail");
00395       return NS_ERROR_FAILURE;
00396     }
00397 
00398     nsCOMPtr<nsIURI> uri;
00399     channel->GetURI(getter_AddRefs(uri));
00400 
00401     mLastURL = uri;
00402   }
00403   //XXXwaterson are these really _not_ mutually exclusive?
00404   // else
00405   if ((progressStateFlags & nsIWebProgressListener::STATE_STOP) && (aStatus == NS_OK)) {
00406     // If the document load is finishing, then wrap up and maybe load
00407     // some more URLs.
00408     nsresult rv;
00409     PRTime endLoadTime = PR_Now();
00410 
00411     nsCOMPtr<nsIURI> uri;
00412     nsCOMPtr<nsIChannel> channel = do_QueryInterface(aRequest);
00413     rv = channel->GetURI(getter_AddRefs(uri));
00414     if (NS_FAILED(rv)) return rv;
00415 
00416     // Ignore this notification unless its for the current url. That way
00417     // we skip over embedded webshell notifications (e.g. frame cells,
00418     // iframes, etc.)
00419     nsCAutoString spec;
00420     uri->GetSpec(spec);
00421 
00422     PRTime delta, cvt, rounder;
00423     LL_I2L(cvt, 1000);
00424     LL_I2L(rounder, 499);
00425     LL_SUB(delta, endLoadTime, mStartLoad);
00426     LL_ADD(delta, delta, rounder);
00427     LL_DIV(delta, delta, cvt);
00428     printf("+++ %s: done loading (%lld msec)\n", spec.get(), delta);
00429 
00430     // Make sure the document bits make it to the screen at least once
00431     nsCOMPtr<nsIPresShell> shell = GetPresShell();
00432     if (shell) {
00433       // Force the presentation shell to update the display
00434       shell->FlushPendingNotifications(Flush_Display);
00435 
00436       if (mJiggleLayout) {
00437         nsRect r;
00438         mBrowser->GetContentBounds(r);
00439         nscoord oldWidth = r.width;
00440         while (r.width > 100) {
00441           r.width -= 10;
00442           mBrowser->SizeWindowTo(r.width, r.height, PR_FALSE, PR_FALSE);
00443         }
00444         while (r.width < oldWidth) {
00445           r.width += 10;
00446           mBrowser->SizeWindowTo(r.width, r.height, PR_FALSE, PR_FALSE);
00447         }
00448       }
00449     }
00450 
00451     if (mCrawl) {
00452       FindMoreURLs();
00453     }
00454 
00455     mTimer = do_CreateInstance("@mozilla.org/timer;1");
00456     if(mPrinterTestType>0){
00457       mDelay = 5000;     // printing needs more time to load, so give it plenty
00458     } else {
00459       mDelay = 200;
00460     }    
00461     
00462     if ((0 < mQueuedLoadURLs) || (0 < mPendingURLs.Count())) {
00463       mTimer->InitWithFuncCallback(LoadNextURLCallback, this, mDelay,
00464                                    nsITimer::TYPE_ONE_SHOT);
00465     }
00466     else if (mPostExit) {
00467       mTimer->InitWithFuncCallback(QueueExitCallback, this, mDelay,
00468                                    nsITimer::TYPE_ONE_SHOT);
00469     }
00470   }
00471 
00472   return NS_OK;
00473 }
00474 
NS_IMETHODIMP
nsWebCrawler::OnProgressChange(nsIWebProgress *aWebProgress,
                               nsIRequest *aRequest,
                               PRInt32 aCurSelfProgress,
                               PRInt32 aMaxSelfProgress,
                               PRInt32 aCurTotalProgress,
                               PRInt32 aMaxTotalProgress)
{
    // Never expected to fire: we register with NOTIFY_STATE_DOCUMENT only
    // (see Start), which excludes progress notifications.
    NS_NOTREACHED("notification excluded in AddProgressListener(...)");
    return NS_OK;
}
00486 
NS_IMETHODIMP
nsWebCrawler::OnLocationChange(nsIWebProgress* aWebProgress,
                               nsIRequest* aRequest,
                               nsIURI *location)
{
    // Never expected to fire: we register with NOTIFY_STATE_DOCUMENT only
    // (see Start), which excludes location notifications.
    NS_NOTREACHED("notification excluded in AddProgressListener(...)");
    return NS_OK;
}
00495 
00496 
NS_IMETHODIMP
nsWebCrawler::OnStatusChange(nsIWebProgress* aWebProgress,
                             nsIRequest* aRequest,
                             nsresult aStatus,
                             const PRUnichar* aMessage)
{
    // Never expected to fire: we register with NOTIFY_STATE_DOCUMENT only
    // (see Start), which excludes status notifications.
    NS_NOTREACHED("notification excluded in AddProgressListener(...)");
    return NS_OK;
}
00506 
00507 
NS_IMETHODIMP
nsWebCrawler::OnSecurityChange(nsIWebProgress *aWebProgress, 
                               nsIRequest *aRequest, 
                               PRUint32 state)
{
    // Never expected to fire: we register with NOTIFY_STATE_DOCUMENT only
    // (see Start), which excludes security notifications.
    NS_NOTREACHED("notification excluded in AddProgressListener(...)");
    return NS_OK;
}
00516 
00517 FILE*
00518 nsWebCrawler::GetOutputFile(nsIURI *aURL, nsString& aOutputName)
00519 {
00520   static const char kDefaultOutputFileName[] = "test.txt";   // the default
00521   FILE *result = nsnull;
00522   if (nsnull!=aURL)
00523   {
00524     char *inputFileName;
00525     nsCAutoString file;
00526     (void)aURL->GetPath(file);
00527     NS_ConvertUTF8toUCS2 inputFileFullPath(file);
00528     PRInt32 fileNameOffset = inputFileFullPath.RFindChar('/');
00529     if (-1==fileNameOffset)
00530     {
00531       inputFileName = new char[strlen(kDefaultOutputFileName) + 1];
00532       strcpy (inputFileName, kDefaultOutputFileName);
00533     }
00534     else
00535     {
00536       PRInt32 len = inputFileFullPath.Length() - fileNameOffset;
00537       inputFileName = new char[len + 1 + 20];
00538       char *c = inputFileName;
00539       for (PRInt32 i=fileNameOffset+1; i<fileNameOffset+len; i++)
00540       {
00541         char ch = (char) inputFileFullPath.CharAt(i);
00542         if (ch == '.') {
00543           // Stop on dot so that we don't keep the old extension
00544           break;
00545         }
00546         *c++ = ch;
00547       }
00548 
00549       // Tack on ".rgd" extension for "regression data"
00550       *c++ = '.';
00551       *c++ = 'r';
00552       *c++ = 'g';
00553       *c++ = 'd';
00554       *c++ = '\0';
00555       aOutputName.Truncate();
00556       aOutputName.AppendWithConversion(inputFileName);
00557     }
00558     nsAutoString outputFileName(mOutputDir);
00559     outputFileName.AppendWithConversion(inputFileName);
00560     PRInt32 bufLen = outputFileName.Length()+1;
00561     char *buf = new char[bufLen+1];
00562     outputFileName.ToCString(buf, bufLen);
00563     result = fopen(buf, "wt");
00564     delete [] buf;
00565     delete [] inputFileName;
00566   }
00567   return result;
00568 }
00569 
00570 void
00571 nsWebCrawler::AddURL(const nsString& aURL)
00572 {
00573   nsString* url = new nsString(aURL);
00574   mPendingURLs.AppendElement(url);
00575   if (mVerbose) {
00576     printf("WebCrawler: adding '");
00577     fputs(NS_LossyConvertUCS2toASCII(aURL).get(), stdout);
00578     printf("'\n");
00579   }
00580 }
00581 
00582 void
00583 nsWebCrawler::AddSafeDomain(const nsString& aDomain)
00584 {
00585   nsString* s = new nsString(aDomain);
00586   mSafeDomains.AppendElement(s);
00587 }
00588 
00589 void
00590 nsWebCrawler::AddAvoidDomain(const nsString& aDomain)
00591 {
00592   nsString* s = new nsString(aDomain);
00593   mAvoidDomains.AppendElement(s);
00594 }
00595 
// Directory where freshly generated regression data (.rgd files) is
// written (see GetOutputFile).
void 
nsWebCrawler::SetOutputDir(const nsString& aOutputDir)
{
  mOutputDir = aOutputDir;
}
00601 
// Directory holding the baseline regression data that freshly generated
// output is compared against (see PerformRegressionTest).
void 
nsWebCrawler::SetRegressionDir(const nsString& aDir)
{
  mRegressionDir = aDir;
}
00607 
00608 void
00609 nsWebCrawler::Start()
00610 {
00611   // Enable observing each URL load...
00612   nsCOMPtr<nsIDocShell> docShell;
00613   mBrowser->GetDocShell(*getter_AddRefs(docShell));
00614   if (docShell) {
00615     nsCOMPtr<nsIWebProgress> progress(do_GetInterface(docShell));
00616     if (progress) {
00617       progress->AddProgressListener(this,
00618                                     nsIWebProgress::NOTIFY_STATE_DOCUMENT);
00619       LoadNextURL(PR_FALSE);
00620     }
00621   }
00622 }
00623 
// Turn on link crawling: after each page loads, FindMoreURLs will be
// called to queue the links it contains (see OnStateChange).
void
nsWebCrawler::EnableCrawler()
{
  mCrawl = PR_TRUE;
}
00629 
// 256-entry ASCII lower-casing table: indices 65-90 ('A'-'Z') map to
// 97-122 ('a'-'z'); every other byte maps to itself.  Used by the
// case-insensitive EndsWith/StartsWith helpers below.
static const unsigned char kLowerLookup[256] = {
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,
    97,98,99,100,101,102,103,104,105,106,107,108,109,
    110,111,112,113,114,115,116,117,118,119,120,121,122,

   91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,

  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
00651 
00652 static PRBool
00653 EndsWith(const nsString& aDomain, const char* aHost, PRInt32 aHostLen)
00654 {
00655   PRInt32 slen = aDomain.Length();
00656   if (slen < aHostLen) {
00657     return PR_FALSE;
00658   }
00659   const PRUnichar* uc = aDomain.get();
00660   uc += slen - aHostLen;
00661   const PRUnichar* end = uc + aHostLen;
00662   while (uc < end) {
00663     unsigned char uch = (unsigned char) ((*uc++) & 0xff);
00664     unsigned char ch = (unsigned char) ((*aHost++) & 0xff);
00665     if (kLowerLookup[uch] != kLowerLookup[ch]) {
00666       return PR_FALSE;
00667     }
00668   }
00669   return PR_TRUE;
00670 }
00671 
00672 static PRBool
00673 StartsWith(const nsString& s1, const char* s2)
00674 {
00675   PRInt32 s1len = s1.Length();
00676   PRInt32 s2len = strlen(s2);
00677   if (s1len < s2len) {
00678     return PR_FALSE;
00679   }
00680   const PRUnichar* uc = s1.get();
00681   const PRUnichar* end = uc + s2len;
00682   while (uc < end) {
00683     unsigned char uch = (unsigned char) ((*uc++) & 0xff);
00684     unsigned char ch = (unsigned char) ((*s2++) & 0xff);
00685     if (kLowerLookup[uch] != kLowerLookup[ch]) {
00686       return PR_FALSE;
00687     }
00688   }
00689   return PR_TRUE;
00690 }
00691 
00692 PRBool
00693 nsWebCrawler::OkToLoad(const nsString& aURLSpec)
00694 {
00695   if (!StartsWith(aURLSpec, "http:") && !StartsWith(aURLSpec, "ftp:") &&
00696       !StartsWith(aURLSpec, "file:") &&
00697       !StartsWith(aURLSpec, "resource:")) {
00698     return PR_FALSE;
00699   }
00700 
00701   PRBool ok = PR_TRUE;
00702   nsIURI* url;
00703   nsresult rv;
00704   rv = NS_NewURI(&url, aURLSpec);
00705 
00706   if (NS_OK == rv) {
00707     nsCAutoString host;
00708     rv = url->GetHost(host);
00709     if (rv == NS_OK) {
00710       PRInt32 hostlen = host.Length();
00711 
00712       // Check domains to avoid
00713       PRInt32 i, n = mAvoidDomains.Count();
00714       for (i = 0; i < n; i++) {
00715         nsString* s = (nsString*) mAvoidDomains.ElementAt(i);
00716         if (s && EndsWith(*s, host.get(), hostlen)) {
00717           printf("Avoiding '");
00718           fputs(NS_LossyConvertUCS2toASCII(aURLSpec).get(), stdout);
00719           printf("'\n");
00720           return PR_FALSE;
00721         }
00722       }
00723 
00724       // Check domains to stay within
00725       n = mSafeDomains.Count();
00726       if (n == 0) {
00727         // If we don't care then all the domains that we aren't
00728         // avoiding are OK
00729         return PR_TRUE;
00730       }
00731       for (i = 0; i < n; i++) {
00732         nsString* s = (nsString*) mSafeDomains.ElementAt(i);
00733         if (s && EndsWith(*s, host.get(), hostlen)) {
00734           return PR_TRUE;
00735         }
00736       }
00737       ok = PR_FALSE;
00738     }
00739     NS_RELEASE(url);
00740   }
00741   return ok;
00742 }
00743 
00744 void
00745 nsWebCrawler::RecordLoadedURL(const nsString& aURL)
00746 {
00747   if (nsnull != mRecord) {
00748     fputs(NS_LossyConvertUCS2toASCII(aURL).get(), mRecord);
00749     fputs("\n", mRecord);
00750     fflush(mRecord);
00751   }
00752 }
00753 
/**
 * Recursively scan aNode and its children for elements that reference
 * other documents -- <a href>, <frame src> and <iframe src>.  Each
 * absolute URL found is recorded in mVisited (so it is considered only
 * once) and, when OkToLoad approves it, appended to mPendingURLs.
 */
void
nsWebCrawler::FindURLsIn(nsIDocument* aDocument, nsIContent* aNode)
{
  nsIAtom *atom = aNode->Tag();
  if ((atom == mLinkTag) || (atom == mFrameTag) || (atom == mIFrameTag)) {
    // Get absolute url that tag targets
    nsAutoString base, src, absURLSpec;
    if (atom == mLinkTag) {
      aNode->GetAttr(kNameSpaceID_None, mHrefAttr, src);
    }
    else {
      // frame/iframe reference their target via "src"
      aNode->GetAttr(kNameSpaceID_None, mSrcAttr, src);
    }
    nsresult rv;
    rv = NS_MakeAbsoluteURI(absURLSpec, src, aDocument->GetDocumentURI());
    if (NS_OK == rv) {
      // Atomize the URL so mVisited can key on pointer identity.
      nsCOMPtr<nsIAtom> urlAtom = do_GetAtom(absURLSpec);
      if (0 == mVisited->Get(urlAtom)) {
        // Remember the URL as visited so that we don't go there again
        mVisited->Put(urlAtom, "visited");
        if (OkToLoad(absURLSpec)) {
          mPendingURLs.AppendElement(new nsString(absURLSpec));
          if (mVerbose) {
            printf("Adding '");
            fputs(NS_LossyConvertUCS2toASCII(absURLSpec).get(), stdout);
            printf("'\n");
          }
        }
        else {
          if (mVerbose) {
            printf("Skipping '");
            fputs(NS_LossyConvertUCS2toASCII(absURLSpec).get(), stdout);
            printf("'\n");
          }
        }
      }
      else {
        if (mVerbose) {
          printf("Already visited '");
          fputs(NS_LossyConvertUCS2toASCII(absURLSpec).get(), stdout);
          printf("'\n");
        }
      }
    }
  }

  // Recurse into child content regardless of this node's tag.
  PRUint32 i, n = aNode->GetChildCount();
  for (i = 0; i < n; ++i) {
    FindURLsIn(aDocument, aNode->GetChildAt(i));
  }
}
00805 
00806 void
00807 nsWebCrawler::FindMoreURLs()
00808 {
00809   nsCOMPtr<nsIDocShell> docShell;
00810   mBrowser->GetDocShell(*getter_AddRefs(docShell));
00811 
00812   if (docShell) {
00813     nsCOMPtr<nsIContentViewer> cv;
00814     docShell->GetContentViewer(getter_AddRefs(cv));
00815     if (cv) {
00816       nsCOMPtr<nsIDocumentViewer> docv = do_QueryInterface(cv);
00817       if (docv) {
00818         nsCOMPtr<nsIDocument> doc;
00819         docv->GetDocument(getter_AddRefs(doc));
00820         if (doc) {
00821           nsIContent *root = doc->GetRootContent();
00822           if (root) {
00823             FindURLsIn(doc, root);
00824           }
00825         }
00826       }
00827     }
00828   }
00829 }
00830 
00831 void 
00832 nsWebCrawler::SetBrowserWindow(nsBrowserWindow* aWindow) 
00833 {
00834   NS_IF_RELEASE(mBrowser);
00835   mBrowser = aWindow;
00836   NS_IF_ADDREF(mBrowser);
00837 }
00838 
// Return the browser window in *aWindow as an owning (AddRef'd)
// reference; may be nsnull if no window has been set.
void
nsWebCrawler::GetBrowserWindow(nsBrowserWindow** aWindow)
{
  NS_IF_ADDREF(mBrowser);
  *aWindow = mBrowser;
}
00845 
/**
 * Pop pending URLs until one passes OkToLoad, then start loading it --
 * queued via QueueLoadURL when aQueueLoad is true, otherwise directly
 * through the docshell's nsIWebNavigation.  Decrements the remaining
 * page budget (mMaxPages) when one is set; once nothing is left to
 * load the record file is closed.
 */
void
nsWebCrawler::LoadNextURL(PRBool aQueueLoad)
{
  // mMaxPages < 0 means "unlimited"; exactly 0 means the budget is spent.
  if ((mMaxPages < 0) || (mMaxPages > 0)) {
    while (0 != mPendingURLs.Count()) {
      // Take ownership of the front pending URL; it is deleted below on
      // every path out of this iteration.
      nsString* url = NS_REINTERPRET_CAST(nsString*, mPendingURLs.ElementAt(0));
      mPendingURLs.RemoveElementAt(0);
      if (nsnull != url) {
        if (OkToLoad(*url)) {
          RecordLoadedURL(*url);
          if (aQueueLoad) {
            // Call stop to cancel any pending URL Refreshes...
            QueueLoadURL(*url);
          }
          else {
            mCurrentURL = *url;
            // Timestamp measured against STATE_STOP in OnStateChange.
            mStartLoad = PR_Now();
            nsCOMPtr<nsIDocShell> docShell;
            mBrowser->GetDocShell(*getter_AddRefs(docShell));
            nsCOMPtr<nsIWebNavigation> webNav(do_QueryInterface(docShell));
            webNav->LoadURI(url->get(), nsIWebNavigation::LOAD_FLAGS_NONE, nsnull, nsnull, nsnull);
          }

          if (mMaxPages > 0) {
            --mMaxPages;
          }
          delete url;
          return;
        }
        delete url;
      }
    }
  }

  // Nothing left to load: close out the record file, if any.
  if (nsnull != mRecord) {
    fclose(mRecord);
    mRecord = nsnull;
  }

} 
00887 
/**
 * Return the pres shell for aDocShell (falling back to the browser's
 * own docshell when aDocShell is null).  The reference returned by
 * GetPresShell's out-param is transferred to the caller via
 * already_AddRefed.  May return nsnull.
 */
already_AddRefed<nsIPresShell>
nsWebCrawler::GetPresShell(nsIDocShell* aDocShell)
{
  nsIPresShell* shell = nsnull;
  nsCOMPtr<nsIDocShell> docShell(aDocShell);
  if (!docShell) {
    mBrowser->GetDocShell(*getter_AddRefs(docShell));
  }
  if (docShell) {
    docShell->GetPresShell(&shell);
  }
  return shell;
}
00901 
// Open "<aBaseName>/<aOutputName>" for reading; returns nsnull (after
// printing a message) when the file can't be opened.
static FILE*
OpenRegressionFile(const nsString& aBaseName, const nsString& aOutputName)
{
  nsAutoString a;
  a.Append(aBaseName);
  a.AppendLiteral("/");
  a.Append(aOutputName);
  char* fn = ToNewCString(a);
  FILE* fp = fopen(fn, "r");
  if (!fp) {
    printf("Unable to open regression data file %s\n", fn);
  }
  // NOTE(review): ToNewCString allocates with the XPCOM allocator;
  // freeing with delete[] looks mismatched -- confirm whether this
  // should be nsMemory::Free(fn).
  delete[] fn;
  return fp;
}
00917 
#define BUF_SIZE 1024
// Load up both data files (original and the one we just output) into
// two independent xml content trees. Then compare them.
void
nsWebCrawler::PerformRegressionTest(const nsString& aOutputName)
{
  // First load the trees
  nsIFrameUtil* fu;
  nsresult rv = CallCreateInstance(kFrameUtilCID, &fu);
  if (NS_FAILED(rv)) {
    printf("Can't find nsIFrameUtil implementation\n");
    return;
  }
  // Baseline data from mRegressionDir...
  FILE* f1 = OpenRegressionFile(mRegressionDir, aOutputName);
  if (!f1) {
    NS_RELEASE(fu);
    return;
  }
  // ...versus freshly generated data from mOutputDir.
  FILE* f2 = OpenRegressionFile(mOutputDir, aOutputName);
  if (!f2) {
    fclose(f1);
    NS_RELEASE(fu);
    return;
  }
  // CompareRegressionData closes both files.
  rv = fu->CompareRegressionData(f1, f2,mRegressionOutputLevel);
  NS_RELEASE(fu);

  // Flatten the names into fixed buffers for the result message.
  char dirName[BUF_SIZE];
  char fileName[BUF_SIZE];
  mOutputDir.ToCString(dirName, BUF_SIZE-1);
  aOutputName.ToCString(fileName, BUF_SIZE-1);

  printf("regression test %s%s %s\n", dirName, fileName, NS_SUCCEEDED(rv) ? "passed" : "failed");
}
00952 
00953 //----------------------------------------------------------------------
00954 
// CID of the XPCOM event queue service used by QueueEvent() below to
// post PLEvents to the current thread.
static NS_DEFINE_IID(kEventQueueServiceCID, NS_EVENTQUEUESERVICE_CID);
// NOTE(review): this IID is not referenced anywhere in this part of the
// file; presumably kept for older-style GetService calls -- confirm.
static NS_DEFINE_IID(kIEventQueueServiceIID, NS_IEVENTQUEUESERVICE_IID);
00957 
00958 static nsresult
00959 QueueEvent(PLEvent* aEvent)
00960 {
00961   nsresult rv;
00962   nsCOMPtr<nsIEventQueueService> eqs =
00963       do_GetService(kEventQueueServiceCID, &rv);
00964   if (NS_FAILED(rv)) {
00965     return rv;
00966   }
00967 
00968   if (eqs) {
00969     nsCOMPtr<nsIEventQueue> eq;
00970     rv = eqs->GetThreadEventQueue(NS_CURRENT_THREAD, getter_AddRefs(eq));
00971     if (eq) {
00972       eq->PostEvent(aEvent);
00973     }
00974   }
00975 
00976   return rv;
00977 }
00978 
00979 //----------------------------------------------------------------------
00980 
// PLEvent subclass which, when dispatched, tells the crawler to shut
// down the viewer (see nsWebCrawler::Exit).  Posted by QueueExit().
struct ExitEvent : public PLEvent {
  ExitEvent(nsWebCrawler* aCrawler);
  ~ExitEvent();

  // Invoked from HandleMe when the event queue delivers this event.
  void DoIt() {
    crawler->Exit();
  }

  // Strong reference: addref'd in the constructor, released in the
  // destructor.
  nsWebCrawler* crawler;

  // PLEvent handler/destroy procs installed by the constructor.
  static void PR_CALLBACK HandleMe(ExitEvent* e);
  static void PR_CALLBACK DeleteMe(ExitEvent* e);
};
00994 
// Wire up the underlying PLEvent with our handler/destroy procs and take
// a strong reference to the crawler (dropped in the destructor).
ExitEvent::ExitEvent(nsWebCrawler* aCrawler)
  : crawler(aCrawler)
{
  PL_InitEvent(this, crawler, (PLHandleEventProc) HandleMe,
               (PLDestroyEventProc) DeleteMe);
  NS_ADDREF(aCrawler);
}
01002 
// Drop the crawler reference taken in the constructor.
ExitEvent::~ExitEvent()
{
  NS_RELEASE(crawler);
}
01007 
// PLEvent handler proc: runs when the owning thread's queue delivers
// the event.
void
ExitEvent::HandleMe(ExitEvent* e)
{
  e->DoIt();
}
01013 
// PLEvent destroy proc: frees the event after it has been handled (or
// when the queue is torn down).
void
ExitEvent::DeleteMe(ExitEvent* e)
{
  delete e;
}
01019 
01020 void
01021 nsWebCrawler::QueueExit()
01022 {
01023   ExitEvent* event = new ExitEvent(this);
01024   QueueEvent(event);
01025 }
01026 
// Shut down the test viewer, ending the crawl (dispatched via
// ExitEvent::DoIt).
void
nsWebCrawler::Exit()
{
  mViewer->Exit();
}
01032 
01033 //----------------------------------------------------------------------
01034 
// PLEvent subclass carrying a URL; when dispatched it asks the crawler
// to navigate there (see nsWebCrawler::GoToQueuedURL).  Posted by
// QueueLoadURL().
struct LoadEvent : public PLEvent {
  LoadEvent(nsWebCrawler* aCrawler, const nsString& aURL);
  ~LoadEvent();

  // Invoked from HandleMe when the event queue delivers this event.
  void DoIt() {
    crawler->GoToQueuedURL(url);
  }

  // URL to load, copied at construction time.
  nsString url;
  // Strong reference: addref'd in the constructor, released in the
  // destructor.
  nsWebCrawler* crawler;

  // PLEvent handler/destroy procs installed by the constructor.
  static void PR_CALLBACK HandleMe(LoadEvent* e);
  static void PR_CALLBACK DeleteMe(LoadEvent* e);
};
01049 
// Copy the URL, wire up the underlying PLEvent with our handler/destroy
// procs, and take a strong reference to the crawler.
LoadEvent::LoadEvent(nsWebCrawler* aCrawler, const nsString& aURL)
  : url(aURL),
    crawler(aCrawler)
{
  PL_InitEvent(this, crawler, (PLHandleEventProc) HandleMe,
               (PLDestroyEventProc) DeleteMe);
  NS_ADDREF(aCrawler);
}
01058 
// Drop the crawler reference taken in the constructor.
LoadEvent::~LoadEvent()
{
  NS_RELEASE(crawler);
}
01063 
// PLEvent handler proc: runs when the owning thread's queue delivers
// the event.
void
LoadEvent::HandleMe(LoadEvent* e)
{
  e->DoIt();
}
01069 
// PLEvent destroy proc: frees the event after it has been handled (or
// when the queue is torn down).
void
LoadEvent::DeleteMe(LoadEvent* e)
{
  delete e;
}
01075 
01076 void
01077 nsWebCrawler::GoToQueuedURL(const nsString& aURL)
01078 {
01079   nsCOMPtr<nsIDocShell> docShell;
01080   mBrowser->GetDocShell(*getter_AddRefs(docShell));
01081   nsCOMPtr<nsIWebNavigation> webNav(do_QueryInterface(docShell));
01082   if (webNav) {
01083     mCurrentURL = aURL;
01084     mStartLoad = PR_Now();
01085     webNav->LoadURI(aURL.get(), nsIWebNavigation::LOAD_FLAGS_NONE, nsnull, nsnull, nsnull);
01086   }
01087   mQueuedLoadURLs--;
01088 
01089 }
01090 
01091 nsresult
01092 nsWebCrawler::QueueLoadURL(const nsString& aURL)
01093 {
01094   LoadEvent* event = new LoadEvent(this, aURL);
01095   nsresult rv = QueueEvent(event);
01096   if (NS_SUCCEEDED(rv)) {
01097     mQueuedLoadURLs++;
01098   }
01099   return rv;
01100 }