Back to index

scribus-ng  1.3.4.dfsg+svn20071115
htmlreader.cpp
Go to the documentation of this file.
00001 /*
00002 For general Scribus (>=1.3.2) copyright and licensing information please refer
00003 to the COPYING file provided with the program. Following this notice may exist
00004 a copyright and/or license notice that predates the release of Scribus 1.3.2
00005 for which a new license (GPL+exception) is in place.
00006 */
00007 /***************************************************************************
00008  *   Copyright (C) 2004 by Riku Leino                                      *
00009  *   tsoots@gmail.com                                                      *
00010  *                                                                         *
00011  *   This program is free software; you can redistribute it and/or modify  *
00012  *   it under the terms of the GNU General Public License as published by  *
00013  *   the Free Software Foundation; either version 2 of the License, or     *
00014  *   (at your option) any later version.                                   *
00015  *                                                                         *
00016  *   This program is distributed in the hope that it will be useful,       *
00017  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
00018  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
00019  *   GNU General Public License for more details.                          *
00020  *                                                                         *
00021  *   You should have received a copy of the GNU General Public License     *
00022  *   along with this program; if not, write to the                         *
00023  *   Free Software Foundation, Inc.,                                       *
00024  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
00025  ***************************************************************************/
00026 
00027 #include <qobject.h>
00028 #include "htmlreader.h"
00029 
00030 #ifdef HAVE_XML
00031 
00032 #include "scribusstructs.h"
00033 #include "gtmeasure.h"
00034 
00035 HTMLReader* HTMLReader::hreader = NULL;
00036 bool HTMLReader::elemJustStarted = false;
00037 bool HTMLReader::elemJustFinished = false;
00038 
00039 extern htmlSAXHandlerPtr mySAXHandler;
00040 
00041 HTMLReader::HTMLReader(gtParagraphStyle *ps, gtWriter *w, bool textOnly)
00042 {
00043        pstyle = ps;
00044        defaultColor = ps->getFont()->getColor();
00045        defaultWeight = ps->getFont()->getWeight();
00046        defaultSlant = ps->getFont()->getSlant();
00047        initPStyles();
00048        inH1 = false;
00049        inH2 = false;
00050        inH3 = false;
00051        inA = false;
00052        inCode = false;
00053        inBody = false;
00054        inPre = false;
00055        inP = false;
00056        inCenter = false;
00057        writer = w;
00058        href = "";
00059        extLinks = "";
00060        extIndex = 1;
00061        listLevel = -1;
00062        inOL = false;
00063        wasInOL = false;
00064        inUL = false;
00065        wasInUL = false;
00066        inLI = false;
00067        addedLI = false;
00068        lastCharWasSpace = false;
00069        noFormatting = textOnly;
00070        hreader = this;
00071 }
00072 
00073 void HTMLReader::initPStyles()
00074 {
00075        pstylec = new gtParagraphStyle(*pstyle);
00076        pstylec->setAlignment(CENTER);
00077        pstylec->setName("HTML_center");
00078        gtParagraphStyle* pstyleli = new gtParagraphStyle(*pstyle);
00079        pstyleli->setIndent(pstyleli->getIndent()+50.0);
00080        pstyleli->setName("HTML_li_level-0");
00081        listStyles.push_back(pstyleli);
00082        nextItemNumbers.push_back(1);
00083        pstyleh4 = new gtParagraphStyle(*pstyle);
00084        pstyleh4->getFont()->setSize(pstyle->getFont()->getSize() + 10);
00085        pstyleh4->getFont()->setWeight(BOLD);
00086        pstyleh4->setSpaceAbove(10.0);
00087        pstyleh4->setSpaceBelow(5.0);
00088        pstyleh4->setName("HTML_h4");
00089        pstyleh3 = new gtParagraphStyle(*pstyle);
00090        pstyleh3->getFont()->setSize(pstyle->getFont()->getSize() + 20);
00091        pstyleh3->getFont()->setWeight(BOLD);
00092        pstyleh3->setSpaceAbove(20.0);
00093        pstyleh3->setSpaceBelow(10.0);
00094        pstyleh3->setName("HTML_h3");
00095        pstyleh2 = new gtParagraphStyle(*pstyle);
00096        pstyleh2->getFont()->setSize(pstyle->getFont()->getSize() + 40);
00097        pstyleh2->getFont()->setWeight(BOLD);
00098        pstyleh2->setSpaceAbove(30.0);
00099        pstyleh2->setSpaceBelow(20.0);
00100        pstyleh2->setName("HTML_h2");
00101        pstyleh1 = new gtParagraphStyle(*pstyle);
00102        pstyleh1->getFont()->setSize(pstyle->getFont()->getSize() + 60);
00103        pstyleh1->getFont()->setWeight(BOLD);
00104        pstyleh1->setSpaceAbove(40.0);
00105        pstyleh1->setSpaceBelow(30.0);
00106        pstyleh1->setName("HTML_h1");
00107        pstylecode = new gtParagraphStyle(*pstyle);
00108        pstylecode->getFont()->setName("Courier Regular");
00109        pstylecode->setName("HTML_code");
00110        pstylep = new gtParagraphStyle(*pstyle);
00111        pstylep->setSpaceBelow(gtMeasure::i2d(5, SC_MM));
00112        pstylep->setName("HTML_p");
00113        pstylepre = new gtParagraphStyle(*pstyle);
00114        pstylepre->setName("HTML_pre");
00115 }
00116 
00117 void HTMLReader::startElement(void*, const xmlChar * fullname, const xmlChar ** atts)
00118 {
00119        elemJustStarted = true;
00120        elemJustFinished = false;
00121        QString* name = new QString((const char*) fullname);
00122        name = new QString(name->lower());
00123        QXmlAttributes* attrs = new QXmlAttributes();
00124        if (atts)
00125        {
00126               for(const xmlChar** cur = atts; cur && *cur; cur += 2)
00127                      attrs->append(QString((char*)*cur), NULL, QString((char*)*cur), QString((char*)*(cur + 1)));
00128        }
00129        hreader->startElement(NULL, NULL, *name, *attrs);
00130 }
00131 
00132 bool HTMLReader::startElement(const QString&, const QString&, const QString &name, const QXmlAttributes &attrs) 
00133 {
00134        if (name == "p")
00135               inP = true;
00136        else if (name == "center")
00137               inCenter = true;
00138        else if (name == "br")
00139               writer->append("\n", pstyle);
00140        else if (name == "a")
00141        {
00142               toggleEffect(UNDERLINE);
00143               setBlueFont();
00144               for (int i = 0; i < attrs.count(); i++)
00145               {
00146                      if (attrs.localName(i) == "href")
00147                      {
00148                             href = attrs.value(i);
00149                      }
00150                      inA = true;
00151               }
00152        } 
00153        else if (name == "ul")
00154        {
00155               ++listLevel;
00156               if (static_cast<int>(listStyles.size()) < (listLevel + 1))
00157                      createListStyle();
00158               inUL = true;
00159               if (inOL)
00160               {
00161                      inOL = false;
00162                      wasInOL = true;
00163               }
00164        }
00165        else if (name == "ol")
00166        {
00167               ++listLevel;
00168               if (static_cast<int>(listStyles.size()) < (listLevel + 1))
00169                      createListStyle();
00170               inOL = true;
00171               if (inUL)
00172               {
00173                      inUL = false;
00174                      wasInUL = true;
00175               }
00176        }
00177        else if (name == "li")
00178               inLI = true;
00179        else if (name == "h1")
00180               inH1 = true;
00181        else if (name == "h2")
00182               inH2 = true;
00183        else if (name == "h3")
00184               inH3 = true;
00185        else if (name == "h4")
00186               inH4 = true;
00187        else if ((name == "b") || (name == "strong"))
00188               setBoldFont();
00189        else if ((name == "i") || (name == "em"))
00190               setItalicFont();
00191        else if (name == "code")
00192               inCode = true;
00193        else if (name == "body")
00194               inBody = true;
00195        else if (name == "pre")
00196               inPre = true;
00197        else if (name == "img")
00198        {
00199               QString imgline("(img,");
00200               for (int i = 0; i < attrs.count(); i++)
00201               {
00202                      if (attrs.localName(i) == "src")
00203                      {
00204                             imgline +=  " src: " + attrs.value(i);
00205                      }
00206                      if (attrs.localName(i) == "alt")
00207                      {
00208                             if (!attrs.value(i).isEmpty())
00209                                    imgline += ", alt: " + attrs.value(i);
00210                      }
00211               }
00212               imgline += ")\n\n";
00213               writer->append(imgline, pstyle);
00214        }
00215        else if (name == "sub")
00216               toggleEffect(SUBSCRIPT);
00217        else if (name == "sup")
00218               toggleEffect(SUPERSCRIPT);
00219        else if (name == "del")
00220               toggleEffect(STRIKETHROUGH);
00221        else if ((name == "ins" || name == "u") && (!inA))
00222               toggleEffect(UNDERLINE);
00223        return true;
00224 }
00225 void HTMLReader::characters(void*, const xmlChar * ch, int len)
00226 {
00227        QString chars = QString::fromUtf8((const char*) ch, len);
00228        hreader->characters(chars);
00229 }
00230 
00231 bool HTMLReader::characters(const QString &ch) 
00232 {
00233        if (inBody)
00234        {
00235               QString tmp = ch;
00236               // FIXME : According to html spec, new lines placed just after or just before an element
00237               // must be ignored, not exactly that, but better than nothing
00238               if (elemJustStarted  || elemJustFinished)
00239               {
00240                      while( !tmp.isEmpty() && (tmp[0] == '\r' || tmp[0] == '\n') )
00241                             tmp = tmp.right(tmp.length() - 1);
00242                      elemJustStarted = elemJustFinished = false;
00243                      if (tmp.isEmpty())
00244                             return true;
00245               }
00246               QString chl = tmp.left(1), chr = tmp.right(1);
00247               bool fcis = (chl.length() > 0 && chl[0].isSpace());
00248               bool lcis = (chr.length() > 0 && chr[0].isSpace());
00249               if (inPre)
00250               {
00251                      if (tmp.left(1) == "\n")
00252                             tmp = tmp.right(tmp.length() - 2);
00253               }
00254               else
00255                      tmp = tmp.simplifyWhiteSpace();
00256 
00257               if (!lastCharWasSpace)
00258                      if (fcis)
00259                             tmp = " " + tmp;
00260 
00261               if (lcis)
00262                      tmp = tmp + " ";
00263               lastCharWasSpace = lcis;
00264               if ((inLI) && (!addedLI))
00265               {
00266                      if (inUL)
00267                             tmp = "- " + tmp;
00268                      else if (inOL)
00269                      {
00270                             tmp = QString("%1. ").arg(nextItemNumbers[listLevel]) + tmp;
00271                             ++nextItemNumbers[listLevel];
00272                      }
00273                      addedLI = true;
00274               }
00275 
00276               if (noFormatting)
00277                      writer->append(tmp);
00278               else if (inP)
00279                      writer->append(tmp, pstylep);
00280               else if (inLI)
00281               {
00282                      writer->append(tmp, listStyles[listLevel]);
00283               }
00284               else if (inH1)
00285                      writer->append(tmp, pstyleh1);
00286               else if (inH2)
00287                      writer->append(tmp, pstyleh2);
00288               else if (inH3)
00289                      writer->append(tmp, pstyleh3);
00290               else if (inH4)
00291                      writer->append(tmp, pstyleh4);
00292               else if (inCenter)
00293                      writer->append(tmp, pstylec);
00294               else if (inCode)
00295                      writer->append(tmp, pstylecode);
00296               else if (inPre)
00297                      writer->append(tmp, pstylepre);
00298               else
00299                      writer->append(tmp, pstyle);
00300        }
00301        return true;
00302 }
00303 
00304 void HTMLReader::endElement(void*, const xmlChar * name)
00305 {
00306        elemJustStarted = false;
00307        elemJustFinished = true;
00308        QString *nname = new QString((const char*) name);
00309        nname = new QString(nname->lower());
00310        hreader->endElement(NULL, NULL, *nname);
00311 }
00312 
00313 bool HTMLReader::endElement(const QString&, const QString&, const QString &name)
00314 {
00315        if (name == "center")
00316        {
00317               inCenter = false;
00318               writer->append("\n");
00319        }
00320        else if (name == "p")
00321        {
00322               writer->append("\n");
00323               inP = false;
00324        }
00325        else if (name == "a")
00326        {
00327               toggleEffect(UNDERLINE);
00328               if ((!href.isEmpty()) && ((href.find("//") != -1) ||
00329                   (href.find("mailto:") != -1) || (href.find("www") != -1)))
00330               {
00331                      href = href.remove("mailto:");
00332                      writer->append(QString(" [%1]").arg(extIndex), pstyle);
00333                      extLinks += QString("[%1] ").arg(extIndex) + href + "\n";
00334                      ++extIndex;
00335               }
00336               href = "";
00337               setDefaultColor();
00338               inA = false;
00339        }
00340        else if (name == "ul")
00341        {
00342               if (listLevel == 0)
00343               {
00344                      inUL = false;
00345                      inOL = false;
00346                      wasInUL = false;
00347                      wasInOL = false;
00348                      listLevel = -1;
00349               }
00350               else if (wasInOL)
00351               {
00352                      inUL = false;
00353                      inOL = true;
00354                      wasInOL = false;
00355                      --listLevel;
00356               }
00357               else if (wasInUL)
00358               {
00359                      inUL = true;
00360                      inOL = false;
00361                      wasInUL = false;
00362                      --listLevel;
00363               }
00364               else
00365                      --listLevel;
00366               if (listLevel == -1)
00367                      writer->append("\n");
00368        }
00369        else if (name == "ol")
00370        {
00371               if (listLevel == 0)
00372               {
00373                      inUL = false;
00374                      inOL = false;
00375                      wasInUL = false;
00376                      wasInOL = false;
00377                      listLevel = -1;
00378               }
00379               else if (wasInUL)
00380               {
00381                      inOL = false;
00382                      inUL = true;
00383                      wasInUL = false;
00384                      nextItemNumbers[listLevel] = 1;
00385                      --listLevel;
00386               }
00387               else if (wasInOL)
00388               {
00389                      inOL = true;
00390                      inUL = false;
00391                      wasInOL = false;
00392                      nextItemNumbers[listLevel] = 1;
00393                      --listLevel;
00394               }
00395               else
00396               {
00397                      nextItemNumbers[listLevel] = 1;
00398                      --listLevel;
00399               }
00400               if (listLevel == -1)
00401                      writer->append("\n");
00402        }
00403        else if (name == "li")
00404        {
00405               inLI = false;
00406               addedLI = false;
00407               writer->append("\n");
00408        }
00409        else if (name == "h1")
00410        {
00411               inH1 = false;
00412               writer->append("\n");
00413        }
00414        else if (name == "h2")
00415        {
00416               inH2 = false;
00417               writer->append("\n");
00418        }
00419        else if (name == "h3")
00420        {
00421               inH3 = false;
00422               writer->append("\n");
00423        }
00424        else if (name == "h4")
00425        {
00426               inH4 = false;
00427               writer->append("\n");
00428        }
00429        else if ((name == "b") || (name == "strong"))
00430               unSetBoldFont();
00431        else if ((name == "i") || (name == "em"))
00432               unsetItalicFont();
00433        else if (name == "code")
00434               inCode = false;
00435        else if (name == "body")
00436               inBody = false;
00437        else if (name == "pre")
00438        {
00439               inPre = false;
00440               writer->append("\n");
00441        }
00442        else if (name == "div")
00443               writer->append("\n");
00444        else if (name == "sub")
00445               toggleEffect(SUBSCRIPT);
00446        else if (name == "sup")
00447               toggleEffect(SUPERSCRIPT);
00448        else if (name == "del")
00449               toggleEffect(STRIKETHROUGH);
00450        else if ((name == "ins" || name == "u") && (!inA))
00451               toggleEffect(UNDERLINE);
00452        return true;
00453 }
00454 
00455 void HTMLReader::toggleEffect(FontEffect e)
00456 {
00457        pstyle->getFont()->toggleEffect(e);
00458        pstylec->getFont()->toggleEffect(e);
00459        for (uint i = 0; i < listStyles.size(); ++i)
00460               listStyles[i]->getFont()->toggleEffect(e);
00461        pstyleh1->getFont()->toggleEffect(e);
00462        pstyleh2->getFont()->toggleEffect(e);
00463        pstyleh3->getFont()->toggleEffect(e);
00464        pstyleh4->getFont()->toggleEffect(e);
00465        pstylecode->getFont()->toggleEffect(e);
00466        pstylep->getFont()->toggleEffect(e);
00467        pstylepre->getFont()->toggleEffect(e);
00468 }
00469 
00470 void HTMLReader::setItalicFont()
00471 {
00472        pstyle->getFont()->setSlant(ITALIC);
00473        pstylec->getFont()->setSlant(ITALIC);
00474        for (uint i = 0; i < listStyles.size(); ++i)
00475               listStyles[i]->getFont()->setSlant(ITALIC);
00476        pstyleh1->getFont()->setSlant(ITALIC);
00477        pstyleh2->getFont()->setSlant(ITALIC);
00478        pstyleh3->getFont()->setSlant(ITALIC);
00479        pstyleh4->getFont()->setSlant(ITALIC);
00480        pstylecode->getFont()->setSlant(ITALIC);
00481        pstylep->getFont()->setSlant(ITALIC);
00482        pstylepre->getFont()->setSlant(ITALIC);
00483 }
00484 
00485 void HTMLReader::unsetItalicFont()
00486 {
00487        pstyle->getFont()->setSlant(defaultSlant);
00488        pstylec->getFont()->setSlant(defaultSlant);
00489        for (uint i = 0; i < listStyles.size(); ++i)
00490               listStyles[i]->getFont()->setSlant(defaultSlant);
00491        pstyleh1->getFont()->setSlant(defaultSlant);
00492        pstyleh2->getFont()->setSlant(defaultSlant);
00493        pstyleh3->getFont()->setSlant(defaultSlant);
00494        pstyleh4->getFont()->setSlant(defaultSlant);
00495        pstylecode->getFont()->setSlant(defaultSlant);
00496        pstylep->getFont()->setSlant(defaultSlant);
00497        pstylepre->getFont()->setSlant(defaultSlant);
00498 }
00499 
00500 void HTMLReader::setBlueFont()
00501 {
00502        pstyle->getFont()->setColor("Blue");
00503        pstylec->getFont()->setColor("Blue");
00504        for (uint i = 0; i < listStyles.size(); ++i)
00505               listStyles[i]->getFont()->setColor("Blue");
00506        pstyleh1->getFont()->setColor("Blue");
00507        pstyleh2->getFont()->setColor("Blue");
00508        pstyleh3->getFont()->setColor("Blue");
00509        pstyleh4->getFont()->setColor("Blue");
00510        pstylecode->getFont()->setColor("Blue");
00511        pstylep->getFont()->setColor("Blue");
00512        pstylepre->getFont()->setColor("Blue");
00513 }
00514 
00515 void HTMLReader::setDefaultColor()
00516 {
00517        pstyle->getFont()->setColor(defaultColor);
00518        pstylec->getFont()->setColor(defaultColor);
00519        for (uint i = 0; i < listStyles.size(); ++i)
00520               listStyles[i]->getFont()->setColor(defaultColor);
00521        pstyleh1->getFont()->setColor(defaultColor);
00522        pstyleh2->getFont()->setColor(defaultColor);
00523        pstyleh3->getFont()->setColor(defaultColor);
00524        pstyleh4->getFont()->setColor(defaultColor);
00525        pstylecode->getFont()->setColor(defaultColor);
00526        pstylep->getFont()->setColor(defaultColor);
00527        pstylepre->getFont()->setColor(defaultColor);
00528 }
00529 
00530 void HTMLReader::setBoldFont()
00531 {
00532        pstyle->getFont()->setWeight(BOLD);
00533        pstylec->getFont()->setWeight(BOLD);
00534        for (uint i = 0; i < listStyles.size(); ++i)
00535               listStyles[i]->getFont()->setWeight(BOLD);
00536        pstylecode->getFont()->setWeight(BOLD);
00537        pstylep->getFont()->setWeight(BOLD);
00538        pstylepre->getFont()->setWeight(BOLD);
00539 }
00540 
00541 void HTMLReader::unSetBoldFont()
00542 {
00543        pstyle->getFont()->setWeight(defaultWeight);
00544        pstylec->getFont()->setWeight(defaultWeight);
00545        for (uint i = 0; i < listStyles.size(); ++i)
00546               listStyles[i]->getFont()->setWeight(defaultWeight);
00547        pstylecode->getFont()->setWeight(REGULAR);
00548        pstylep->getFont()->setWeight(defaultWeight);
00549        pstylepre->getFont()->setWeight(defaultWeight);
00550 }
00551 
00552 void HTMLReader::parse(QString filename)
00553 {
00554 #if defined(_WIN32)
00555        QString fname = QDir::convertSeparators(filename);
00556        QCString fn = (qWinVersion() & Qt::WV_NT_based) ? fname.utf8() : fname.local8Bit();
00557 #else
00558        QCString fn(filename.local8Bit());
00559 #endif
00560        elemJustStarted = elemJustFinished = false;
00561        htmlSAXParseFile(fn.data(), NULL, mySAXHandler, NULL);
00562 }
00563 
00564 void HTMLReader::createListStyle()
00565 {
00566        gtParagraphStyle* tmpStyle = new gtParagraphStyle(*listStyles[0]);
00567        tmpStyle->setName(QString("HTML_li_level-%1").arg(listLevel + 1));
00568        double indent = listStyles[0]->getIndent();
00569        indent += 25 * (listLevel + 1);
00570        tmpStyle->setIndent(indent);
00571        listStyles.push_back(tmpStyle);
00572        nextItemNumbers.push_back(1);
00573 }
00574 
00575 htmlSAXHandler mySAXHandlerStruct = {
00576        NULL, // internalSubset,
00577        NULL, // isStandalone,
00578        NULL, // hasInternalSubset,
00579        NULL, // hasExternalSubset,
00580        NULL, // resolveEntity,
00581        NULL, // getEntity,
00582        NULL, // entityDecl,
00583        NULL, // notationDecl,
00584        NULL, // attributeDecl,
00585        NULL, // elementDecl,
00586        NULL, // unparsedEntityDecl,
00587        NULL, // setDocumentLocator,
00588        NULL, // startDocument,
00589        NULL, // endDocument,
00590        HTMLReader::startElement,
00591        HTMLReader::endElement,
00592        NULL, // reference,
00593        HTMLReader::characters,
00594        NULL, // ignorableWhitespace,
00595        NULL, // processingInstruction,
00596        NULL, // comment,
00597        NULL, // warning,
00598        NULL, // error,
00599        NULL, // fatalError,
00600        NULL, // getParameterEntity,
00601        NULL, // cdata,
00602        NULL,
00603        1
00604 #ifdef HAVE_XML26
00605        ,
00606        NULL,
00607        NULL,
00608        NULL,
00609        NULL
00610 #endif
00611 };
00612 
00613 htmlSAXHandlerPtr mySAXHandler = &mySAXHandlerStruct;
00614 
00615 HTMLReader::~HTMLReader()
00616 {
00617        if (!extLinks.isEmpty())
00618        {
00619               writer->append(QObject::tr("\nExternal Links\n"), pstyleh4);
00620               writer->append(extLinks, pstyle);
00621        }
00622        for (uint i = 0; i < listStyles.size(); ++i)
00623               delete listStyles[i];
00624        delete pstylec;
00625        delete pstyleh1;
00626        delete pstyleh2;
00627        delete pstyleh3;
00628        delete pstyleh4;
00629        delete pstylecode;
00630        delete pstylep;
00631        delete pstylepre;
00632        hreader = NULL;
00633 }
00634 
00635 #endif // HAVE_XML