Back to index

nux  3.0.0
tinyxmlparser.cpp
Go to the documentation of this file.
00001 /*
00002  * Copyright 2010 Inalogic® Inc.
00003  *
00004  * This program is free software: you can redistribute it and/or modify it
00005  * under the terms of the GNU Lesser General Public License, as
00006  * published by the  Free Software Foundation; either version 2.1 or 3.0
00007  * of the License.
00008  *
00009  * This program is distributed in the hope that it will be useful, but
00010  * WITHOUT ANY WARRANTY; without even the implied warranties of
00011  * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR
00012  * PURPOSE.  See the applicable version of the GNU Lesser General Public
00013  * License for more details.
00014  *
00015  * You should have received a copy of both the GNU Lesser General Public
00016  * License along with this program. If not, see <http://www.gnu.org/licenses/>
00017  *
00018  * Authored by: Jay Taoko <jaytaoko@inalogic.com>
00019  *
00020  */
00021 
00022 
00023 /*
00024 www.sourceforge.net/projects/tinyxml
00025 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
00026 
00027 This software is provided 'as-is', without any express or implied
00028 warranty. In no event will the authors be held liable for any
00029 damages arising from the use of this software.
00030 
00031 Permission is granted to anyone to use this software for any
00032 purpose, including commercial applications, and to alter it and
00033 redistribute it freely, subject to the following restrictions:
00034 
00035 1. The origin of this software must not be misrepresented; you must
00036 not claim that you wrote the original software. If you use this
00037 software in a product, an acknowledgment in the product documentation
00038 would be appreciated but is not required.
00039 
00040 2. Altered source versions must be plainly marked as such, and
00041 must not be misrepresented as being the original software.
00042 
00043 3. This notice may not be removed or altered from any source
00044 distribution.
00045 */
00046 
00047 #include <ctype.h>
00048 #include <stddef.h>
00049 
00050 #include "tinyxml.h"
00051 
00052 //#define DEBUG_PARSER
00053 #if defined( DEBUG_PARSER )
00054 #      if defined( DEBUG ) && defined( _MSC_VER )
00055 #             include <windows.h>
00056 #             define TIXML_LOG OutputDebugString
00057 #      else
00058 #             define TIXML_LOG printf
00059 #      endif
00060 #endif
00061 
00062 // Note tha "PutString" hardcodes the same list. This
00063 // is less flexible than it appears. Changing the entries
00064 // or order will break putstring.
00065 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
00066 {
00067   { "&amp;",  5, '&' },
00068   { "&lt;",   4, '<' },
00069   { "&gt;",   4, '>' },
00070   { "&quot;", 6, '\"' },
00071   { "&apos;", 6, '\'' }
00072 };
00073 
// A lot of Unicode background is collected at:
//            http://www.unicode.org/faq/utf_bom.html
// That page is also the basis of the utf8ByteTable lookup defined below,
// which gives the number of bytes in a sequence from its lead byte. Invalid
// lead bytes are given a length of 1: the result will be junk, but it is
// passed through as much as possible.
//
// Beware of these three-byte sequences, which are not ordinary characters
// in UTF-8:
//                          ef bb bf (Microsoft "lead bytes")
//                          ef bf be
//                          ef bf bf

// The three Microsoft "lead bytes", in stream order.
const unsigned char TIXML_UTF_LEAD_0 = 0xEFu;
const unsigned char TIXML_UTF_LEAD_1 = 0xBBu;
const unsigned char TIXML_UTF_LEAD_2 = 0xBFu;
00087 
00088 const int TiXmlBase::utf8ByteTable[256] =
00089 {
00090   //   0      1      2      3      4      5      6      7      8      9      a      b      c      d      e      f
00091   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x00
00092   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x10
00093   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x20
00094   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x30
00095   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x40
00096   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x50
00097   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x60
00098   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x70       End of ASCII range
00099   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x80 0x80 to 0xc1 invalid
00100   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0x90
00101   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0xa0
00102   1,   1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     // 0xb0
00103   1,   1,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     // 0xc0 0xc2 to 0xdf 2 byte
00104   2,   2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     2,     // 0xd0
00105   3,   3,     3,     3,     3,     3,     3,     3,     3,     3,     3,     3,     3,     3,     3,     3,     // 0xe0 0xe0 to 0xef 3 byte
00106   4,   4,     4,     4,     4,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1,     1      // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
00107 };
00108 
00109 
00110 void TiXmlBase::ConvertUTF32ToUTF8 ( unsigned long input, char *output, int *length )
00111 {
00112   const unsigned long BYTE_MASK = 0xBF;
00113   const unsigned long BYTE_MARK = 0x80;
00114   const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00115 
00116   if (input < 0x80)
00117     *length = 1;
00118   else if ( input < 0x800 )
00119     *length = 2;
00120   else if ( input < 0x10000 )
00121     *length = 3;
00122   else if ( input < 0x200000 )
00123     *length = 4;
00124   else
00125   {
00126     *length = 0;  // This code won't covert this correctly anyway.
00127     return;
00128   }
00129 
00130   output += *length;
00131 
00132   // Scary scary fall throughs.
00133   switch (*length)
00134   {
00135     case 4:
00136       --output;
00137       *output = (char) ( (input | BYTE_MARK) & BYTE_MASK);
00138       input >>= 6;
00139     case 3:
00140       --output;
00141       *output = (char) ( (input | BYTE_MARK) & BYTE_MASK);
00142       input >>= 6;
00143     case 2:
00144       --output;
00145       *output = (char) ( (input | BYTE_MARK) & BYTE_MASK);
00146       input >>= 6;
00147     case 1:
00148       --output;
00149       *output = (char) (input | FIRST_BYTE_MARK[*length]);
00150   }
00151 }
00152 
00153 
00154 /*static*/ int TiXmlBase::IsAlpha ( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00155 {
00156   // This will only work for low-ascii, everything else is assumed to be a valid
00157   // letter. I'm not sure this is the best approach, but it is quite tricky trying
00158   // to figure out alhabetical vs. not across encoding. So take a very
00159   // conservative approach.
00160 
00161 //     if ( encoding == TIXML_ENCODING_UTF8 )
00162 //     {
00163   if ( anyByte < 127 )
00164     return isalpha ( anyByte );
00165   else
00166     return 1; // What else to do? The unicode set is huge...get the english ones right.
00167 
00168 //     }
00169 //     else
00170 //     {
00171 //            return isalpha( anyByte );
00172 //     }
00173 }
00174 
00175 
00176 /*static*/ int TiXmlBase::IsAlphaNum ( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00177 {
00178   // This will only work for low-ascii, everything else is assumed to be a valid
00179   // letter. I'm not sure this is the best approach, but it is quite tricky trying
00180   // to figure out alhabetical vs. not across encoding. So take a very
00181   // conservative approach.
00182 
00183 //     if ( encoding == TIXML_ENCODING_UTF8 )
00184 //     {
00185   if ( anyByte < 127 )
00186     return isalnum ( anyByte );
00187   else
00188     return 1; // What else to do? The unicode set is huge...get the english ones right.
00189 
00190 //     }
00191 //     else
00192 //     {
00193 //            return isalnum( anyByte );
00194 //     }
00195 }
00196 
00197 
00198 class TiXmlParsingData
00199 {
00200   friend class TiXmlDocument;
00201 public:
00202   void Stamp ( const char *now, TiXmlEncoding encoding );
00203 
00204   const TiXmlCursor &Cursor()
00205   {
00206     return cursor;
00207   }
00208 
00209 private:
00210   // Only used by the document!
00211   TiXmlParsingData ( const char *start, int _tabsize, int row, int col )
00212   {
00213     assert ( start );
00214     stamp = start;
00215     tabsize = _tabsize;
00216     cursor.row = row;
00217     cursor.col = col;
00218   }
00219 
00220   TiXmlCursor        cursor;
00221   const char         *stamp;
00222   int                       tabsize;
00223 };
00224 
00225 
00226 void TiXmlParsingData::Stamp ( const char *now, TiXmlEncoding encoding )
00227 {
00228   assert ( now );
00229 
00230   // Do nothing if the tabsize is 0.
00231   if ( tabsize < 1 )
00232   {
00233     return;
00234   }
00235 
00236   // Get the current row, column.
00237   int row = cursor.row;
00238   int col = cursor.col;
00239   const char *p = stamp;
00240   assert ( p );
00241 
00242   while ( p < now )
00243   {
00244     // Treat p as unsigned, so we have a happy compiler.
00245     const unsigned char *pU = (const unsigned char *) p;
00246 
00247     // Code contributed by Fletcher Dunn: (modified by lee)
00248     switch (*pU)
00249     {
00250       case 0:
00251         // We *should* never get here, but in case we do, don't
00252         // advance past the terminating null character, ever
00253         return;
00254 
00255       case '\r':
00256         // bump down to the next line
00257         ++row;
00258         col = 0;
00259         // Eat the character
00260         ++p;
00261 
00262         // Check for \r\n sequence, and treat this as a single character
00263         if (*p == '\n')
00264         {
00265           ++p;
00266         }
00267 
00268         break;
00269 
00270       case '\n':
00271         // bump down to the next line
00272         ++row;
00273         col = 0;
00274 
00275         // Eat the character
00276         ++p;
00277 
00278         // Check for \n\r sequence, and treat this as a single
00279         // character.  (Yes, this bizarre thing does occur still
00280         // on some arcane platforms...)
00281         if (*p == '\r')
00282         {
00283           ++p;
00284         }
00285 
00286         break;
00287 
00288       case '\t':
00289         // Eat the character
00290         ++p;
00291 
00292         // Skip to next tab stop
00293         col = (col / tabsize + 1) * tabsize;
00294         break;
00295 
00296       case TIXML_UTF_LEAD_0:
00297 
00298         if ( encoding == TIXML_ENCODING_UTF8 )
00299         {
00300           if ( * (p + 1) && * (p + 2) )
00301           {
00302             // In these cases, don't advance the column. These are
00303             // 0-width spaces.
00304             if ( * (pU + 1) == TIXML_UTF_LEAD_1 && * (pU + 2) == TIXML_UTF_LEAD_2 )
00305               p += 3;
00306             else if ( * (pU + 1) == 0xbfU && * (pU + 2) == 0xbeU )
00307               p += 3;
00308             else if ( * (pU + 1) == 0xbfU && * (pU + 2) == 0xbfU )
00309               p += 3;
00310             else
00311             {
00312               p += 3;  // A normal character.
00313               ++col;
00314             }
00315           }
00316         }
00317         else
00318         {
00319           ++p;
00320           ++col;
00321         }
00322 
00323         break;
00324 
00325       default:
00326 
00327         if ( encoding == TIXML_ENCODING_UTF8 )
00328         {
00329           // Eat the 1 to 4 byte utf8 character.
00330           int step = TiXmlBase::utf8ByteTable[* ( (const unsigned char *) p) ];
00331 
00332           if ( step == 0 )
00333             step = 1;              // Error case from bad encoding, but handle gracefully.
00334 
00335           p += step;
00336 
00337           // Just advance one column, of course.
00338           ++col;
00339         }
00340         else
00341         {
00342           ++p;
00343           ++col;
00344         }
00345 
00346         break;
00347     }
00348   }
00349 
00350   cursor.row = row;
00351   cursor.col = col;
00352   assert ( cursor.row >= -1 );
00353   assert ( cursor.col >= -1 );
00354   stamp = p;
00355   assert ( stamp );
00356 }
00357 
00358 
00359 const char *TiXmlBase::SkipWhiteSpace ( const char *p, TiXmlEncoding encoding )
00360 {
00361   if ( !p || !*p )
00362   {
00363     return 0;
00364   }
00365 
00366   if ( encoding == TIXML_ENCODING_UTF8 )
00367   {
00368     while ( *p )
00369     {
00370       const unsigned char *pU = (const unsigned char *) p;
00371 
00372       // Skip the stupid Microsoft UTF-8 Byte order marks
00373       if (    * (pU + 0) == TIXML_UTF_LEAD_0
00374             && * (pU + 1) == TIXML_UTF_LEAD_1
00375             && * (pU + 2) == TIXML_UTF_LEAD_2 )
00376       {
00377         p += 3;
00378         continue;
00379       }
00380       else if (* (pU + 0) == TIXML_UTF_LEAD_0
00381                && * (pU + 1) == 0xbfU
00382                && * (pU + 2) == 0xbeU )
00383       {
00384         p += 3;
00385         continue;
00386       }
00387       else if (* (pU + 0) == TIXML_UTF_LEAD_0
00388                && * (pU + 1) == 0xbfU
00389                && * (pU + 2) == 0xbfU )
00390       {
00391         p += 3;
00392         continue;
00393       }
00394 
00395       if ( IsWhiteSpace ( *p ) || *p == '\n' || *p == '\r' )          // Still using old rules for white space.
00396         ++p;
00397       else
00398         break;
00399     }
00400   }
00401   else
00402   {
00403     while ( (*p && IsWhiteSpace ( *p ) ) || (*p == '\n') || (*p == '\r' ) )
00404       ++p;
00405   }
00406 
00407   return p;
00408 }
00409 
00410 #ifdef TIXML_USE_STL
00411 /*static*/ bool TiXmlBase::StreamWhiteSpace ( std::istream *in, TIXML_STRING *tag )
00412 {
00413   for ( ;; )
00414   {
00415     if ( !in->good() ) return false;
00416 
00417     int c = in->peek();
00418 
00419     // At this scope, we can't get to a document. So fail silently.
00420     if ( !IsWhiteSpace ( c ) || c <= 0 )
00421       return true;
00422 
00423     *tag += (char) in->get();
00424   }
00425 }
00426 
00427 /*static*/ bool TiXmlBase::StreamTo ( std::istream *in, int character, TIXML_STRING *tag )
00428 {
00429   //assert( character > 0 && character < 128 );  // else it won't work in utf-8
00430   while ( in->good() )
00431   {
00432     int c = in->peek();
00433 
00434     if ( c == character )
00435       return true;
00436 
00437     if ( c <= 0 )           // Silent failure: can't get document at this scope
00438       return false;
00439 
00440     in->get();
00441     *tag += (char) c;
00442   }
00443 
00444   return false;
00445 }
00446 #endif
00447 
00448 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
00449 // "assign" optimization removes over 10% of the execution time.
00450 //
00451 const char *TiXmlBase::ReadName ( const char *p, TIXML_STRING *name, TiXmlEncoding encoding )
00452 {
00453   // Oddly, not supported on some comilers,
00454   //name->clear();
00455   // So use this:
00456   *name = "";
00457   assert ( p );
00458 
00459   // Names start with letters or underscores.
00460   // Of course, in unicode, tinyxml has no idea what a letter *is*. The
00461   // algorithm is generous.
00462   //
00463   // After that, they can be letters, underscores, numbers,
00464   // hyphens, or colons. (Colons are valid ony for namespaces,
00465   // but tinyxml can't tell namespaces from names.)
00466   if (    p && *p
00467           && ( IsAlpha ( (unsigned char) *p, encoding ) || *p == '_' ) )
00468   {
00469     const char *start = p;
00470 
00471     while (          p && *p
00472               &&     (             IsAlphaNum ( (unsigned char ) *p, encoding )
00473                       || *p == '_'
00474                       || *p == '-'
00475                       || *p == '.'
00476                       || *p == ':' ) )
00477     {
00478       //(*name) += *p; // expensive
00479       ++p;
00480     }
00481 
00482     if ( p - start > 0 )
00483     {
00484       name->assign ( start, p - start );
00485     }
00486 
00487     return p;
00488   }
00489 
00490   return 0;
00491 }
00492 
00493 const char *TiXmlBase::GetEntity ( const char *p, char *value, int *length, TiXmlEncoding encoding )
00494 {
00495   // Presume an entity, and pull it out.
00496   TIXML_STRING ent;
00497   int i;
00498   *length = 0;
00499 
00500   if ( * (p + 1) && * (p + 1) == '#' && * (p + 2) )
00501   {
00502     unsigned long ucs = 0;
00503     ptrdiff_t delta = 0;
00504     unsigned mult = 1;
00505 
00506     if ( * (p + 2) == 'x' )
00507     {
00508       // Hexadecimal.
00509       if ( !* (p + 3) ) return 0;
00510 
00511       const char *q = p + 3;
00512       q = strchr ( q, ';' );
00513 
00514       if ( !q || !*q ) return 0;
00515 
00516       delta = q - p;
00517       --q;
00518 
00519       while ( *q != 'x' )
00520       {
00521         if ( *q >= '0' && *q <= '9' )
00522           ucs += mult * (*q - '0');
00523         else if ( *q >= 'a' && *q <= 'f' )
00524           ucs += mult * (*q - 'a' + 10);
00525         else if ( *q >= 'A' && *q <= 'F' )
00526           ucs += mult * (*q - 'A' + 10 );
00527         else
00528           return 0;
00529 
00530         mult *= 16;
00531         --q;
00532       }
00533     }
00534     else
00535     {
00536       // Decimal.
00537       if ( !* (p + 2) ) return 0;
00538 
00539       const char *q = p + 2;
00540       q = strchr ( q, ';' );
00541 
00542       if ( !q || !*q ) return 0;
00543 
00544       delta = q - p;
00545       --q;
00546 
00547       while ( *q != '#' )
00548       {
00549         if ( *q >= '0' && *q <= '9' )
00550           ucs += mult * (*q - '0');
00551         else
00552           return 0;
00553 
00554         mult *= 10;
00555         --q;
00556       }
00557     }
00558 
00559     if ( encoding == TIXML_ENCODING_UTF8 )
00560     {
00561       // convert the UCS to UTF-8
00562       ConvertUTF32ToUTF8 ( ucs, value, length );
00563     }
00564     else
00565     {
00566       *value = (char) ucs;
00567       *length = 1;
00568     }
00569 
00570     return p + delta + 1;
00571   }
00572 
00573   // Now try to match it.
00574   for ( i = 0; i < NUM_ENTITY; ++i )
00575   {
00576     if ( strncmp ( entity[i].str, p, entity[i].strLength ) == 0 )
00577     {
00578       assert ( strlen ( entity[i].str ) == entity[i].strLength );
00579       *value = entity[i].chr;
00580       *length = 1;
00581       return ( p + entity[i].strLength );
00582     }
00583   }
00584 
00585   // So it wasn't an entity, its unrecognized, or something like that.
00586   *value = *p;       // Don't put back the last one, since we return it!
00587   //*length = 1;     // Leave unrecognized entities - this doesn't really work.
00588   // Just writes strange XML.
00589   return p + 1;
00590 }
00591 
00592 
00593 bool TiXmlBase::StringEqual ( const char *p,
00594                               const char *tag,
00595                               bool ignoreCase,
00596                               TiXmlEncoding encoding )
00597 {
00598   assert ( p );
00599   assert ( tag );
00600 
00601   if ( !p || !*p )
00602   {
00603     assert ( 0 );
00604     return false;
00605   }
00606 
00607   const char *q = p;
00608 
00609   if ( ignoreCase )
00610   {
00611     while ( *q && *tag && ToLower ( *q, encoding ) == ToLower ( *tag, encoding ) )
00612     {
00613       ++q;
00614       ++tag;
00615     }
00616 
00617     if ( *tag == 0 )
00618       return true;
00619   }
00620   else
00621   {
00622     while ( *q && *tag && *q == *tag )
00623     {
00624       ++q;
00625       ++tag;
00626     }
00627 
00628     if ( *tag == 0 )        // Have we found the end of the tag, and everything equal?
00629       return true;
00630   }
00631 
00632   return false;
00633 }
00634 
00635 const char *TiXmlBase::ReadText (  const char *p,
00636                                   TIXML_STRING *text,
00637                                   bool trimWhiteSpace,
00638                                   const char *endTag,
00639                                   bool caseInsensitive,
00640                                   TiXmlEncoding encoding )
00641 {
00642   *text = "";
00643 
00644   if (    !trimWhiteSpace                 // certain tags always keep whitespace
00645           || !condenseWhiteSpace ) // if true, whitespace is always kept
00646   {
00647     // Keep all the white space.
00648     while (      p && *p
00649                && !StringEqual ( p, endTag, caseInsensitive, encoding )
00650           )
00651     {
00652       int len;
00653       char cArr[4] = { 0, 0, 0, 0 };
00654       p = GetChar ( p, cArr, &len, encoding );
00655       text->append ( cArr, len );
00656     }
00657   }
00658   else
00659   {
00660     bool whitespace = false;
00661 
00662     // Remove leading white space:
00663     p = SkipWhiteSpace ( p, encoding );
00664 
00665     while (      p && *p
00666                && !StringEqual ( p, endTag, caseInsensitive, encoding ) )
00667     {
00668       if ( *p == '\r' || *p == '\n' )
00669       {
00670         whitespace = true;
00671         ++p;
00672       }
00673       else if ( IsWhiteSpace ( *p ) )
00674       {
00675         whitespace = true;
00676         ++p;
00677       }
00678       else
00679       {
00680         // If we've found whitespace, add it before the
00681         // new character. Any whitespace just becomes a space.
00682         if ( whitespace )
00683         {
00684           (*text) += ' ';
00685           whitespace = false;
00686         }
00687 
00688         int len;
00689         char cArr[4] = { 0, 0, 0, 0 };
00690         p = GetChar ( p, cArr, &len, encoding );
00691 
00692         if ( len == 1 )
00693           (*text) += cArr[0];      // more efficient
00694         else
00695           text->append ( cArr, len );
00696       }
00697     }
00698   }
00699 
00700   if ( p )
00701     p += strlen ( endTag );
00702 
00703   return p;
00704 }
00705 
00706 #ifdef TIXML_USE_STL
00707 
00708 void TiXmlDocument::StreamIn ( std::istream *in, TIXML_STRING *tag )
00709 {
00710   // The basic issue with a document is that we don't know what we're
00711   // streaming. Read something presumed to be a tag (and hope), then
00712   // identify it, and call the appropriate stream method on the tag.
00713   //
00714   // This "pre-streaming" will never read the closing ">" so the
00715   // sub-tag can orient itself.
00716 
00717   if ( !StreamTo ( in, '<', tag ) )
00718   {
00719     SetError ( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00720     return;
00721   }
00722 
00723   while ( in->good() )
00724   {
00725     int tagIndex = (int) tag->length();
00726 
00727     while ( in->good() && in->peek() != '>' )
00728     {
00729       int c = in->get();
00730 
00731       if ( c <= 0 )
00732       {
00733         SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00734         break;
00735       }
00736 
00737       (*tag) += (char) c;
00738     }
00739 
00740     if ( in->good() )
00741     {
00742       // We now have something we presume to be a node of
00743       // some sort. Identify it, and call the node to
00744       // continue streaming.
00745       TiXmlNode *node = Identify ( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
00746 
00747       if ( node )
00748       {
00749         node->StreamIn ( in, tag );
00750         bool isElement = node->ToElement() != 0;
00751         delete node;
00752         node = 0;
00753 
00754         // If this is the root element, we're done. Parsing will be
00755         // done by the >> operator.
00756         if ( isElement )
00757         {
00758           return;
00759         }
00760       }
00761       else
00762       {
00763         SetError ( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00764         return;
00765       }
00766     }
00767   }
00768 
00769   // We should have returned sooner.
00770   SetError ( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00771 }
00772 
00773 #endif
00774 
00775 const char *TiXmlDocument::Parse ( const char *p, TiXmlParsingData *prevData, TiXmlEncoding encoding )
00776 {
00777   ClearError();
00778 
00779   // Parse away, at the document level. Since a document
00780   // contains nothing but other tags, most of what happens
00781   // here is skipping white space.
00782   if ( !p || !*p )
00783   {
00784     SetError ( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00785     return 0;
00786   }
00787 
00788   // Note that, for a document, this needs to come
00789   // before the while space skip, so that parsing
00790   // starts from the pointer we are given.
00791   location.Clear();
00792 
00793   if ( prevData )
00794   {
00795     location.row = prevData->cursor.row;
00796     location.col = prevData->cursor.col;
00797   }
00798   else
00799   {
00800     location.row = 0;
00801     location.col = 0;
00802   }
00803 
00804   TiXmlParsingData data ( p, TabSize(), location.row, location.col );
00805   location = data.Cursor();
00806 
00807   if ( encoding == TIXML_ENCODING_UNKNOWN )
00808   {
00809     // Check for the Microsoft UTF-8 lead bytes.
00810     const unsigned char *pU = (const unsigned char *) p;
00811 
00812     if (      * (pU + 0) && * (pU + 0) == TIXML_UTF_LEAD_0
00813           && * (pU + 1) && * (pU + 1) == TIXML_UTF_LEAD_1
00814           && * (pU + 2) && * (pU + 2) == TIXML_UTF_LEAD_2 )
00815     {
00816       encoding = TIXML_ENCODING_UTF8;
00817       useMicrosoftBOM = true;
00818     }
00819   }
00820 
00821   p = SkipWhiteSpace ( p, encoding );
00822 
00823   if ( !p )
00824   {
00825     SetError ( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00826     return 0;
00827   }
00828 
00829   while ( p && *p )
00830   {
00831     TiXmlNode *node = Identify ( p, encoding );
00832 
00833     if ( node )
00834     {
00835       p = node->Parse ( p, &data, encoding );
00836       LinkEndChild ( node );
00837       
00838       /* LinkEndChild may potentially free the node.
00839         If this happens we should break to avoid dereferencing it */
00840       if ( !node )
00841        break;
00842     }
00843     else
00844     {
00845       break;
00846     }
00847 
00848     // Did we get encoding info?
00849     if (    encoding == TIXML_ENCODING_UNKNOWN
00850             && node->ToDeclaration() )
00851     {
00852       TiXmlDeclaration *dec = node->ToDeclaration();
00853       const char *enc = dec->Encoding();
00854       assert ( enc );
00855 
00856       if ( *enc == 0 )
00857         encoding = TIXML_ENCODING_UTF8;
00858       else if ( StringEqual ( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
00859         encoding = TIXML_ENCODING_UTF8;
00860       else if ( StringEqual ( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
00861         encoding = TIXML_ENCODING_UTF8;   // incorrect, but be nice
00862       else
00863         encoding = TIXML_ENCODING_LEGACY;
00864     }
00865 
00866     p = SkipWhiteSpace ( p, encoding );
00867   }
00868 
00869   // Was this empty?
00870   if ( !firstChild )
00871   {
00872     SetError ( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
00873     return 0;
00874   }
00875 
00876   // All is well.
00877   return p;
00878 }
00879 
00880 void TiXmlDocument::SetError ( int err, const char *pError, TiXmlParsingData *data, TiXmlEncoding encoding )
00881 {
00882   // The first error in a chain is more accurate - don't set again!
00883   if ( error )
00884     return;
00885 
00886   assert ( err > 0 && err < TIXML_ERROR_STRING_COUNT );
00887   error   = true;
00888   errorId = err;
00889   errorDesc = errorString[ errorId ];
00890 
00891   errorLocation.Clear();
00892 
00893   if ( pError && data )
00894   {
00895     data->Stamp ( pError, encoding );
00896     errorLocation = data->Cursor();
00897   }
00898 }
00899 
00900 
00901 TiXmlNode *TiXmlNode::Identify ( const char *p, TiXmlEncoding encoding )
00902 {
00903   TiXmlNode *returnNode = 0;
00904 
00905   p = SkipWhiteSpace ( p, encoding );
00906 
00907   if ( !p || !*p || *p != '<' )
00908   {
00909     return 0;
00910   }
00911 
00912   TiXmlDocument *doc = GetDocument();
00913   p = SkipWhiteSpace ( p, encoding );
00914 
00915   if ( !p || !*p )
00916   {
00917     return 0;
00918   }
00919 
00920   // What is this thing?
00921   // - Elements start with a letter or underscore, but xml is reserved.
00922   // - Comments: <!--
00923   // - Decleration: <?xml
00924   // - Everthing else is unknown to tinyxml.
00925   //
00926 
00927   const char *xmlHeader = { "<?xml" };
00928   const char *commentHeader = { "<!--" };
00929   const char *dtdHeader = { "<!" };
00930   const char *cdataHeader = { "<![CDATA[" };
00931 
00932   if ( StringEqual ( p, xmlHeader, true, encoding ) )
00933   {
00934 #ifdef DEBUG_PARSER
00935     TIXML_LOG ( "XML parsing Declaration\n" );
00936 #endif
00937     returnNode = new TiXmlDeclaration();
00938   }
00939   else if ( StringEqual ( p, commentHeader, false, encoding ) )
00940   {
00941 #ifdef DEBUG_PARSER
00942     TIXML_LOG ( "XML parsing Comment\n" );
00943 #endif
00944     returnNode = new TiXmlComment();
00945   }
00946   else if ( StringEqual ( p, cdataHeader, false, encoding ) )
00947   {
00948 #ifdef DEBUG_PARSER
00949     TIXML_LOG ( "XML parsing CDATA\n" );
00950 #endif
00951     TiXmlText *text = new TiXmlText ( "" );
00952     text->SetCDATA ( true );
00953     returnNode = text;
00954   }
00955   else if ( StringEqual ( p, dtdHeader, false, encoding ) )
00956   {
00957 #ifdef DEBUG_PARSER
00958     TIXML_LOG ( "XML parsing Unknown(1)\n" );
00959 #endif
00960     returnNode = new TiXmlUnknown();
00961   }
00962   else if (    IsAlpha ( * (p + 1), encoding )
00963                || * (p + 1) == '_' )
00964   {
00965 #ifdef DEBUG_PARSER
00966     TIXML_LOG ( "XML parsing Element\n" );
00967 #endif
00968     returnNode = new TiXmlElement ( "" );
00969   }
00970   else
00971   {
00972 #ifdef DEBUG_PARSER
00973     TIXML_LOG ( "XML parsing Unknown(2)\n" );
00974 #endif
00975     returnNode = new TiXmlUnknown();
00976   }
00977 
00978   if ( returnNode )
00979   {
00980     // Set the parent, so it can report errors
00981     returnNode->parent = this;
00982   }
00983   else
00984   {
00985     if ( doc )
00986       doc->SetError ( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
00987   }
00988 
00989   return returnNode;
00990 }
00991 
00992 #ifdef TIXML_USE_STL
00993 
00994 void TiXmlElement::StreamIn (std::istream *in, TIXML_STRING *tag)
00995 {
00996   // We're called with some amount of pre-parsing. That is, some of "this"
00997   // element is in "tag". Go ahead and stream to the closing ">"
00998   while ( in->good() )
00999   {
01000     int c = in->get();
01001 
01002     if ( c <= 0 )
01003     {
01004       TiXmlDocument *document = GetDocument();
01005 
01006       if ( document )
01007         document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01008 
01009       return;
01010     }
01011 
01012     (*tag) += (char) c ;
01013 
01014     if ( c == '>' )
01015       break;
01016   }
01017 
01018   if ( tag->length() < 3 ) return;
01019 
01020   // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
01021   // If not, identify and stream.
01022 
01023   if (    tag->at ( tag->length() - 1 ) == '>'
01024           && tag->at ( tag->length() - 2 ) == '/' )
01025   {
01026     // All good!
01027     return;
01028   }
01029   else if ( tag->at ( tag->length() - 1 ) == '>' )
01030   {
01031     // There is more. Could be:
01032     //        text
01033     //        cdata text (which looks like another node)
01034     //        closing tag
01035     //        another node.
01036     for ( ;; )
01037     {
01038       StreamWhiteSpace ( in, tag );
01039 
01040       // Do we have text?
01041       if ( in->good() && in->peek() != '<' )
01042       {
01043         // Yep, text.
01044         TiXmlText text ( "" );
01045         text.StreamIn ( in, tag );
01046 
01047         // What follows text is a closing tag or another node.
01048         // Go around again and figure it out.
01049         continue;
01050       }
01051 
01052       // We now have either a closing tag...or another node.
01053       // We should be at a "<", regardless.
01054       if ( !in->good() ) return;
01055 
01056       assert ( in->peek() == '<' );
01057       int tagIndex = (int) tag->length();
01058 
01059       bool closingTag = false;
01060       bool firstCharFound = false;
01061 
01062       for ( ;; )
01063       {
01064         if ( !in->good() )
01065           return;
01066 
01067         int c = in->peek();
01068 
01069         if ( c <= 0 )
01070         {
01071           TiXmlDocument *document = GetDocument();
01072 
01073           if ( document )
01074             document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01075 
01076           return;
01077         }
01078 
01079         if ( c == '>' )
01080           break;
01081 
01082         *tag += (char) c;
01083         in->get();
01084 
01085         // Early out if we find the CDATA id.
01086         if ( c == '[' && tag->size() >= 9 )
01087         {
01088           size_t len = tag->size();
01089           const char *start = tag->c_str() + len - 9;
01090 
01091           if ( strcmp ( start, "<![CDATA[" ) == 0 )
01092           {
01093             assert ( !closingTag );
01094             break;
01095           }
01096         }
01097 
01098         if ( !firstCharFound && c != '<' && !IsWhiteSpace ( c ) )
01099         {
01100           firstCharFound = true;
01101 
01102           if ( c == '/' )
01103             closingTag = true;
01104         }
01105       }
01106 
01107       // If it was a closing tag, then read in the closing '>' to clean up the input stream.
01108       // If it was not, the streaming will be done by the tag.
01109       if ( closingTag )
01110       {
01111         if ( !in->good() )
01112           return;
01113 
01114         int c = in->get();
01115 
01116         if ( c <= 0 )
01117         {
01118           TiXmlDocument *document = GetDocument();
01119 
01120           if ( document )
01121             document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01122 
01123           return;
01124         }
01125 
01126         assert ( c == '>' );
01127         *tag += (char) c;
01128 
01129         // We are done, once we've found our closing tag.
01130         return;
01131       }
01132       else
01133       {
01134         // If not a closing tag, id it, and stream.
01135         const char *tagloc = tag->c_str() + tagIndex;
01136         TiXmlNode *node = Identify ( tagloc, TIXML_DEFAULT_ENCODING );
01137 
01138         if ( !node )
01139           return;
01140 
01141         node->StreamIn ( in, tag );
01142         delete node;
01143         node = 0;
01144 
01145         // No return: go around from the beginning: text, closing tag, or node.
01146       }
01147     }
01148   }
01149 }
01150 #endif
01151 
01152 const char *TiXmlElement::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding )
01153 {
01154   p = SkipWhiteSpace ( p, encoding );
01155   TiXmlDocument *document = GetDocument();
01156 
01157   if ( !p || !*p )
01158   {
01159     if ( document ) document->SetError ( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
01160 
01161     return 0;
01162   }
01163 
01164   if ( data )
01165   {
01166     data->Stamp ( p, encoding );
01167     location = data->Cursor();
01168   }
01169 
01170   if ( *p != '<' )
01171   {
01172     if ( document ) document->SetError ( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
01173 
01174     return 0;
01175   }
01176 
01177   p = SkipWhiteSpace ( p + 1, encoding );
01178 
01179   // Read the name.
01180   const char *pErr = p;
01181 
01182   p = ReadName ( p, &value, encoding );
01183 
01184   if ( !p || !*p )
01185   {
01186     if ( document )  document->SetError ( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
01187 
01188     return 0;
01189   }
01190 
01191   TIXML_STRING endTag ("</");
01192   endTag += value;
01193   endTag += ">";
01194 
01195   // Check for and read attributes. Also look for an empty
01196   // tag or an end tag.
01197   while ( p && *p )
01198   {
01199     pErr = p;
01200     p = SkipWhiteSpace ( p, encoding );
01201 
01202     if ( !p || !*p )
01203     {
01204       if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01205 
01206       return 0;
01207     }
01208 
01209     if ( *p == '/' )
01210     {
01211       ++p;
01212 
01213       // Empty tag.
01214       if ( *p  != '>' )
01215       {
01216         if ( document ) document->SetError ( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
01217 
01218         return 0;
01219       }
01220 
01221       return (p + 1);
01222     }
01223     else if ( *p == '>' )
01224     {
01225       // Done with attributes (if there were any.)
01226       // Read the value -- which can include other
01227       // elements -- read the end tag, and return.
01228       ++p;
01229       p = ReadValue ( p, data, encoding );              // Note this is an Element method, and will set the error if one happens.
01230 
01231       if ( !p || !*p )
01232       {
01233         // We were looking for the end tag, but found nothing.
01234         // Fix for [ 1663758 ] Failure to report error on bad XML
01235         if ( document ) document->SetError ( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01236 
01237         return 0;
01238       }
01239 
01240       // We should find the end tag now
01241       if ( StringEqual ( p, endTag.c_str(), false, encoding ) )
01242       {
01243         p += endTag.length();
01244         return p;
01245       }
01246       else
01247       {
01248         if ( document ) document->SetError ( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01249 
01250         return 0;
01251       }
01252     }
01253     else
01254     {
01255       // Try to read an attribute:
01256       TiXmlAttribute *attrib = new TiXmlAttribute();
01257 
01258       if ( !attrib )
01259       {
01260         if ( document ) document->SetError ( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
01261 
01262         return 0;
01263       }
01264 
01265       attrib->SetDocument ( document );
01266       pErr = p;
01267       p = attrib->Parse ( p, data, encoding );
01268 
01269       if ( !p || !*p )
01270       {
01271         if ( document ) document->SetError ( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
01272 
01273         delete attrib;
01274         return 0;
01275       }
01276 
01277       // Handle the strange case of double attributes:
01278 #ifdef TIXML_USE_STL
01279       TiXmlAttribute *node = attributeSet.Find ( attrib->NameTStr() );
01280 #else
01281       TiXmlAttribute *node = attributeSet.Find ( attrib->Name() );
01282 #endif
01283 
01284       if ( node )
01285       {
01286         node->SetValue ( attrib->Value() );
01287         delete attrib;
01288         return 0;
01289       }
01290 
01291       attributeSet.Add ( attrib );
01292     }
01293   }
01294 
01295   return p;
01296 }
01297 
01298 
01299 const char *TiXmlElement::ReadValue ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding )
01300 {
01301   TiXmlDocument *document = GetDocument();
01302 
01303   // Read in text and elements in any order.
01304   const char *pWithWhiteSpace = p;
01305   p = SkipWhiteSpace ( p, encoding );
01306 
01307   while ( p && *p )
01308   {
01309     if ( *p != '<' )
01310     {
01311       // Take what we have, make a text element.
01312       TiXmlText *textNode = new TiXmlText ( "" );
01313 
01314       if ( !textNode )
01315       {
01316         if ( document ) document->SetError ( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
01317 
01318         return 0;
01319       }
01320 
01321       if ( TiXmlBase::IsWhiteSpaceCondensed() )
01322       {
01323         p = textNode->Parse ( p, data, encoding );
01324       }
01325       else
01326       {
01327         // Special case: we want to keep the white space
01328         // so that leading spaces aren't removed.
01329         p = textNode->Parse ( pWithWhiteSpace, data, encoding );
01330       }
01331 
01332       if ( !textNode->Blank() )
01333         LinkEndChild ( textNode );
01334       else
01335         delete textNode;
01336     }
01337     else
01338     {
01339       // We hit a '<'
01340       // Have we hit a new element or an end tag? This could also be
01341       // a TiXmlText in the "CDATA" style.
01342       if ( StringEqual ( p, "</", false, encoding ) )
01343       {
01344         return p;
01345       }
01346       else
01347       {
01348         TiXmlNode *node = Identify ( p, encoding );
01349 
01350         if ( node )
01351         {
01352           p = node->Parse ( p, data, encoding );
01353           LinkEndChild ( node );
01354         }
01355         else
01356         {
01357           return 0;
01358         }
01359       }
01360     }
01361 
01362     pWithWhiteSpace = p;
01363     p = SkipWhiteSpace ( p, encoding );
01364   }
01365 
01366   if ( !p )
01367   {
01368     if ( document ) document->SetError ( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
01369   }
01370 
01371   return p;
01372 }
01373 
01374 
01375 #ifdef TIXML_USE_STL
01376 void TiXmlUnknown::StreamIn ( std::istream *in, TIXML_STRING *tag )
01377 {
01378   while ( in->good() )
01379   {
01380     int c = in->get();
01381 
01382     if ( c <= 0 )
01383     {
01384       TiXmlDocument *document = GetDocument();
01385 
01386       if ( document )
01387         document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01388 
01389       return;
01390     }
01391 
01392     (*tag) += (char) c;
01393 
01394     if ( c == '>' )
01395     {
01396       // All is well.
01397       return;
01398     }
01399   }
01400 }
01401 #endif
01402 
01403 
01404 const char *TiXmlUnknown::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding )
01405 {
01406   TiXmlDocument *document = GetDocument();
01407   p = SkipWhiteSpace ( p, encoding );
01408 
01409   if ( data )
01410   {
01411     data->Stamp ( p, encoding );
01412     location = data->Cursor();
01413   }
01414 
01415   if ( !p || !*p || *p != '<' )
01416   {
01417     if ( document ) document->SetError ( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
01418 
01419     return 0;
01420   }
01421 
01422   ++p;
01423   value = "";
01424 
01425   while ( p && *p && *p != '>' )
01426   {
01427     value += *p;
01428     ++p;
01429   }
01430 
01431   if ( !p )
01432   {
01433     if ( document )  document->SetError ( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
01434   }
01435 
01436   if ( *p == '>' )
01437     return p + 1;
01438 
01439   return p;
01440 }
01441 
01442 #ifdef TIXML_USE_STL
01443 void TiXmlComment::StreamIn ( std::istream *in, TIXML_STRING *tag )
01444 {
01445   while ( in->good() )
01446   {
01447     int c = in->get();
01448 
01449     if ( c <= 0 )
01450     {
01451       TiXmlDocument *document = GetDocument();
01452 
01453       if ( document )
01454         document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01455 
01456       return;
01457     }
01458 
01459     (*tag) += (char) c;
01460 
01461     if ( c == '>'
01462          && tag->at ( tag->length() - 2 ) == '-'
01463          && tag->at ( tag->length() - 3 ) == '-' )
01464     {
01465       // All is well.
01466       return;
01467     }
01468   }
01469 }
01470 #endif
01471 
01472 
01473 const char *TiXmlComment::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding )
01474 {
01475   TiXmlDocument *document = GetDocument();
01476   value = "";
01477 
01478   p = SkipWhiteSpace ( p, encoding );
01479 
01480   if ( data )
01481   {
01482     data->Stamp ( p, encoding );
01483     location = data->Cursor();
01484   }
01485 
01486   const char *startTag = "<!--";
01487 
01488   const char *endTag   = "-->";
01489 
01490   if ( !StringEqual ( p, startTag, false, encoding ) )
01491   {
01492     document->SetError ( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
01493     return 0;
01494   }
01495 
01496   p += strlen ( startTag );
01497 
01498   // [ 1475201 ] TinyXML parses entities in comments
01499   // Oops - ReadText doesn't work, because we don't want to parse the entities.
01500   // p = ReadText( p, &value, false, endTag, false, encoding );
01501   //
01502   // from the XML spec:
01503   /*
01504    [Definition: Comments may appear anywhere in a document outside other markup; in addition,
01505                 they may appear within the document type declaration at places allowed by the grammar.
01506                        They are not part of the document's character data; an XML processor MAY, but need not,
01507                        make it possible for an application to retrieve the text of comments. For compatibility,
01508                        the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
01509                        references MUST NOT be recognized within comments.
01510 
01511                        An example of a comment:
01512 
01513                        <!-- declarations for <head> & <body> -->
01514   */
01515 
01516   value = "";
01517 
01518   // Keep all the white space.
01519   while (     p && *p && !StringEqual ( p, endTag, false, encoding ) )
01520   {
01521     value.append ( p, 1 );
01522     ++p;
01523   }
01524 
01525   if ( p )
01526     p += strlen ( endTag );
01527 
01528   return p;
01529 }
01530 
01531 
01532 const char *TiXmlAttribute::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding )
01533 {
01534   p = SkipWhiteSpace ( p, encoding );
01535 
01536   if ( !p || !*p ) return 0;
01537 
01538 //     int tabsize = 4;
01539 //     if ( document )
01540 //            tabsize = document->TabSize();
01541 
01542   if ( data )
01543   {
01544     data->Stamp ( p, encoding );
01545     location = data->Cursor();
01546   }
01547 
01548   // Read the name, the '=' and the value.
01549   const char *pErr = p;
01550   p = ReadName ( p, &name, encoding );
01551 
01552   if ( !p || !*p )
01553   {
01554     if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01555 
01556     return 0;
01557   }
01558 
01559   p = SkipWhiteSpace ( p, encoding );
01560 
01561   if ( !p || !*p || *p != '=' )
01562   {
01563     if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01564 
01565     return 0;
01566   }
01567 
01568   ++p; // skip '='
01569   p = SkipWhiteSpace ( p, encoding );
01570 
01571   if ( !p || !*p )
01572   {
01573     if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01574 
01575     return 0;
01576   }
01577 
01578   const char *end;
01579 
01580   const char SINGLE_QUOTE = '\'';
01581 
01582   const char DOUBLE_QUOTE = '\"';
01583 
01584   if ( *p == SINGLE_QUOTE )
01585   {
01586     ++p;
01587     end = "\'";             // single quote in string
01588     p = ReadText ( p, &value, false, end, false, encoding );
01589   }
01590   else if ( *p == DOUBLE_QUOTE )
01591   {
01592     ++p;
01593     end = "\"";             // double quote in string
01594     p = ReadText ( p, &value, false, end, false, encoding );
01595   }
01596   else
01597   {
01598     // All attribute values should be in single or double quotes.
01599     // But this is such a common error that the parser will try
01600     // its best, even without them.
01601     value = "";
01602 
01603     while (    p && *p                                                                            // existence
01604                && !IsWhiteSpace ( *p ) && *p != '\n' && *p != '\r'    // whitespace
01605                && *p != '/' && *p != '>' )                                                 // tag end
01606     {
01607       if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE )
01608       {
01609         // [ 1451649 ] Attribute values with trailing quotes not handled correctly
01610         // We did not have an opening quote but seem to have a
01611         // closing one. Give up and throw an error.
01612         if ( document ) document->SetError ( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01613 
01614         return 0;
01615       }
01616 
01617       value += *p;
01618       ++p;
01619     }
01620   }
01621 
01622   return p;
01623 }
01624 
01625 #ifdef TIXML_USE_STL
01626 void TiXmlText::StreamIn ( std::istream *in, TIXML_STRING *tag )
01627 {
01628   while ( in->good() )
01629   {
01630     int c = in->peek();
01631 
01632     if ( !cdata && (c == '<' ) )
01633     {
01634       return;
01635     }
01636 
01637     if ( c <= 0 )
01638     {
01639       TiXmlDocument *document = GetDocument();
01640 
01641       if ( document )
01642         document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01643 
01644       return;
01645     }
01646 
01647     (*tag) += (char) c;
01648     in->get();       // "commits" the peek made above
01649 
01650     if ( cdata && c == '>' && tag->size() >= 3 )
01651     {
01652       size_t len = tag->size();
01653 
01654       if ( (*tag) [len-2] == ']' && (*tag) [len-3] == ']' )
01655       {
01656         // terminator of cdata.
01657         return;
01658       }
01659     }
01660   }
01661 }
01662 #endif
01663 
01664 const char *TiXmlText::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding encoding )
01665 {
01666   value = "";
01667   TiXmlDocument *document = GetDocument();
01668 
01669   if ( data )
01670   {
01671     data->Stamp ( p, encoding );
01672     location = data->Cursor();
01673   }
01674 
01675   const char *const startTag = "<![CDATA[";
01676 
01677   const char *const endTag   = "]]>";
01678 
01679   if ( cdata || StringEqual ( p, startTag, false, encoding ) )
01680   {
01681     cdata = true;
01682 
01683     if ( !StringEqual ( p, startTag, false, encoding ) )
01684     {
01685       document->SetError ( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
01686       return 0;
01687     }
01688 
01689     p += strlen ( startTag );
01690 
01691     // Keep all the white space, ignore the encoding, etc.
01692     while (      p && *p
01693                && !StringEqual ( p, endTag, false, encoding )
01694           )
01695     {
01696       value += *p;
01697       ++p;
01698     }
01699 
01700     TIXML_STRING dummy;
01701     p = ReadText ( p, &dummy, false, endTag, false, encoding );
01702     return p;
01703   }
01704   else
01705   {
01706     bool ignoreWhite = true;
01707 
01708     const char *end = "<";
01709     p = ReadText ( p, &value, ignoreWhite, end, false, encoding );
01710 
01711     if ( p )
01712       return p - 1;  // don't truncate the '<'
01713 
01714     return 0;
01715   }
01716 }
01717 
01718 #ifdef TIXML_USE_STL
01719 void TiXmlDeclaration::StreamIn ( std::istream *in, TIXML_STRING *tag )
01720 {
01721   while ( in->good() )
01722   {
01723     int c = in->get();
01724 
01725     if ( c <= 0 )
01726     {
01727       TiXmlDocument *document = GetDocument();
01728 
01729       if ( document )
01730         document->SetError ( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01731 
01732       return;
01733     }
01734 
01735     (*tag) += (char) c;
01736 
01737     if ( c == '>' )
01738     {
01739       // All is well.
01740       return;
01741     }
01742   }
01743 }
01744 #endif
01745 
01746 const char *TiXmlDeclaration::Parse ( const char *p, TiXmlParsingData *data, TiXmlEncoding _encoding )
01747 {
01748   p = SkipWhiteSpace ( p, _encoding );
01749   // Find the beginning, find the end, and look for
01750   // the stuff in-between.
01751   TiXmlDocument *document = GetDocument();
01752 
01753   if ( !p || !*p || !StringEqual ( p, "<?xml", true, _encoding ) )
01754   {
01755     if ( document ) document->SetError ( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
01756 
01757     return 0;
01758   }
01759 
01760   if ( data )
01761   {
01762     data->Stamp ( p, _encoding );
01763     location = data->Cursor();
01764   }
01765 
01766   p += 5;
01767 
01768   version = "";
01769   encoding = "";
01770   standalone = "";
01771 
01772   while ( p && *p )
01773   {
01774     if ( *p == '>' )
01775     {
01776       ++p;
01777       return p;
01778     }
01779 
01780     p = SkipWhiteSpace ( p, _encoding );
01781 
01782     if ( StringEqual ( p, "version", true, _encoding ) )
01783     {
01784       TiXmlAttribute attrib;
01785       p = attrib.Parse ( p, data, _encoding );
01786       version = attrib.Value();
01787     }
01788     else if ( StringEqual ( p, "encoding", true, _encoding ) )
01789     {
01790       TiXmlAttribute attrib;
01791       p = attrib.Parse ( p, data, _encoding );
01792       encoding = attrib.Value();
01793     }
01794     else if ( StringEqual ( p, "standalone", true, _encoding ) )
01795     {
01796       TiXmlAttribute attrib;
01797       p = attrib.Parse ( p, data, _encoding );
01798       standalone = attrib.Value();
01799     }
01800     else
01801     {
01802       // Read over whatever it is.
01803       while ( p && *p && *p != '>' && !IsWhiteSpace ( *p ) )
01804         ++p;
01805     }
01806   }
01807 
01808   return 0;
01809 }
01810 
01811 bool TiXmlText::Blank() const
01812 {
01813   for ( unsigned i = 0; i < value.length(); i++ )
01814     if ( !IsWhiteSpace ( value[i] ) )
01815       return false;
01816 
01817   return true;
01818 }
01819