Back to index

lightning-sunbird  0.9+nobinonly
nsWebCrawler.h
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is Mozilla Communicator client code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *   Brian Ryner <bryner@brianryner.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either of the GNU General Public License Version 2 or later (the "GPL"),
00027  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 #ifndef nsWebCrawler_h___
00039 #define nsWebCrawler_h___
00040 
00041 #include "nsCOMPtr.h"
00042 #include "nsBrowserWindow.h"
00043 #include "nsIWebProgressListener.h"
00044 #include "nsVoidArray.h"
00045 #include "nsString.h"
00046 #include "nsIAtom.h"
00047 #include "nsWeakReference.h"
00048 #include "nsIURI.h"
00049 
00050 
00051 class nsIContent;
00052 class nsIDocument;
00053 class nsITimer;
00054 class nsIURI;
00055 class nsIPresShell;
00056 class nsViewerApp;
00057 class AtomHashTable;
00058 
00059 class nsWebCrawler : public nsIWebProgressListener,
00060                      public nsSupportsWeakReference {
00061 public:
00062   // Make a new web-crawler for the given viewer. Note: the web
00063   // crawler does not addref the viewer.
00064   nsWebCrawler(nsViewerApp* aViewer);
00065 
00066   // nsISupports
00067   NS_DECL_ISUPPORTS
00068 
00069   // nsIWebProgressListener
00070   NS_DECL_NSIWEBPROGRESSLISTENER
00071 
00072   // Add a url to load
00073   void AddURL(const nsString& aURL);
00074 
00075   // Add a domain that is safe to load url's from
00076   void AddSafeDomain(const nsString& aDomain);
00077 
00078   // Add a domain that must be avoided
00079   void AddAvoidDomain(const nsString& aDomain);
00080 
00081   void SetBrowserWindow(nsBrowserWindow* aWindow);
00082   void GetBrowserWindow(nsBrowserWindow** aWindow);
00083 
00084   void SetPrintTest(PRInt32 aTestType) { mPrinterTestType = aTestType; }
00085 
00086   void RegressionOutput(PRInt32 aRegressionOutputLevel) { mRegressionOutputLevel = aRegressionOutputLevel; }
00087 
00088   void EnableJiggleLayout() {
00089     mJiggleLayout = PR_TRUE;
00090   }
00091 
00092   // If set to TRUE the loader will post an exit message on exit
00093   void SetExitOnDone(PRBool aPostExit) {
00094     mPostExit = aPostExit;
00095   }
00096 
00097   // Start loading documents
00098   void Start();
00099 
00100   // Enable the crawler; when a document contains links to other
00101   // documents the crawler will go to them subject to the limitations
00102   // on the total crawl count and the domain name checks.
00103   void EnableCrawler();
00104 
00105   void SetRecordFile(FILE* aFile) {
00106     mRecord = aFile;
00107   }
00108 
00109   void SetMaxPages(PRInt32 aMax) {
00110     mMaxPages = aMax;
00111   }
00112 
00113   void SetOutputDir(const nsString& aOutputDir);
00114 
00115   void DumpRegressionData();
00116   void SetRegressionDir(const nsString& aOutputDir);
00117 
00118   void SetEnableRegression(PRBool aSetting) {
00119     mRegressing = aSetting;
00120   }
00121 
00122   static void
00123   LoadNextURLCallback(nsITimer* aTimer, void* aClosure);
00124 
00125   void LoadNextURL(PRBool aQueueLoad);
00126 
00127   nsresult QueueLoadURL(const nsString& aURL);
00128 
00129   void GoToQueuedURL(const nsString& aURL);
00130 
00131   static void
00132   QueueExitCallback(nsITimer* atimer, void* aClosure);
00133 
00134   void QueueExit();
00135 
00136   void Exit();
00137 
00138   void SetVerbose(PRBool aSetting) {
00139     mVerbose = aSetting;
00140   }
00141 
00142   PRBool Crawling() const {
00143     return mCrawl;
00144   }
00145 
00146   PRBool LoadingURLList() const {
00147     return mHaveURLList;
00148   }
00149 
00150   void IncludeStyleData(PRBool aIncludeStyle) {
00151     mIncludeStyleInfo = aIncludeStyle;
00152   }
00153 
00154 protected:
00155   virtual ~nsWebCrawler();
00156 
00157   void FindURLsIn(nsIDocument* aDocument, nsIContent* aNode);
00158 
00159   void FindMoreURLs();
00160 
00161   PRBool OkToLoad(const nsString& aURLSpec);
00162 
00163   void RecordLoadedURL(const nsString& aURLSpec);
00164 
00166   FILE* GetOutputFile(nsIURI *aURL, nsString& aOutputName);
00167 
00168   already_AddRefed<nsIPresShell> GetPresShell(nsIDocShell* aDocShell = nsnull);
00169 
00170   void PerformRegressionTest(const nsString& aOutputName);
00171 
00172   nsBrowserWindow* mBrowser;
00173   nsViewerApp* mViewer;
00174   nsCOMPtr<nsITimer> mTimer;
00175   FILE* mRecord;
00176   nsCOMPtr<nsIAtom> mLinkTag;
00177   nsCOMPtr<nsIAtom> mFrameTag;
00178   nsCOMPtr<nsIAtom> mIFrameTag;
00179   nsCOMPtr<nsIAtom> mHrefAttr;
00180   nsCOMPtr<nsIAtom> mSrcAttr;
00181   nsCOMPtr<nsIAtom> mBaseHrefAttr;
00182   AtomHashTable* mVisited;
00183   nsString mOutputDir;
00184 
00185   PRBool mCrawl;
00186   PRBool mHaveURLList;
00187   PRBool mJiggleLayout;
00188   PRBool mPostExit;
00189   PRInt32 mDelay;
00190   PRInt32 mMaxPages;
00191 
00192   nsString mCurrentURL;
00193   nsCOMPtr<nsIURI>  mLastURL;
00194 
00195   PRTime mStartLoad;
00196   PRBool mVerbose;
00197   PRBool mRegressing;
00198   PRInt32 mPrinterTestType;
00199   PRInt32 mRegressionOutputLevel;
00200   nsString mRegressionDir;
00201   PRBool mIncludeStyleInfo;
00202 
00203   nsVoidArray mPendingURLs;
00204   nsVoidArray mSafeDomains;
00205   nsVoidArray mAvoidDomains;
00206 
00207   PRInt32 mQueuedLoadURLs;
00208 };
00209 
00210 #endif /* nsWebCrawler_h___ */