Back to index

lightning-sunbird  0.9+nobinonly
rdfparse.c
Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either of the GNU General Public License Version 2 or later (the "GPL"),
00026  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 
00038 
#include "rdf-int.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
00041 
00042 
00043 char* error_string = NULL;
00044 int lineNumber = 0;
00045 
00046 static HashTable resourceHash = NULL;
00047 static RDF_Resource gURL = NULL;
00048 
00049 RDF_Resource 
00050 getResource (char* key, int createp) {
00051   RDF_Resource existing = (RDF_Resource) HashLookup(resourceHash, key);
00052   if (existing) {
00053     return existing;
00054   } else if (createp){
00055     existing = (RDF_Resource)fgetMem(sizeof(RDF_ResourceStruct));
00056     existing->url = fcopyString(key);
00057     HashAdd(resourceHash, existing->url, existing);
00058     if (!gURL) {
00059       gURL = (RDF_Resource)fgetMem(sizeof(RDF_ResourceStruct));
00060       gURL->url = fcopyString("URL");
00061       HashAdd(resourceHash, gURL->url, gURL);
00062     }
00063     remoteStoreAdd(NULL, existing, gURL, existing->url, RDF_STRING_TYPE,1); 
00064     return existing;
00065   } else return NULL;
00066 }
00067 
00068 char* 
00069 RDF_ResourceID (RDF_Resource u) {
00070   return u->url;
00071 }
00072 
/* Next free byte inside the current arena block (NULL until first use). */
static char* MemBlock = 0;
/* Number of bytes already handed out from the current arena block. */
size_t allocated = 0;
#define MEM_BLOCK_SIZE 10000

/*
 * Bump allocator: carve `rsize` bytes out of a MEM_BLOCK_SIZE arena obtained
 * from getMem().  Memory returned here is never released individually.
 *
 * The request is rounded up so that every returned pointer is aligned for
 * pointers and doubles.  A request too large for an arena block receives a
 * dedicated block from getMem() rather than overrunning the arena.
 *
 * Returns NULL when memory cannot be obtained; the arena state is left
 * untouched in that case.
 */
char*
fgetMem (size_t rsize) {
  const size_t align = (sizeof(void*) > sizeof(double)) ? sizeof(void*)
                                                         : sizeof(double);
  size_t size;
  char* ans;

  /* Guard the rounding below against wrap-around. */
  if (rsize > ((size_t) -1) - align) return NULL;

  size = ((rsize + align - 1) / align) * align;
  if (size == 0) size = align;  /* zero-byte requests still get a distinct address */

  if (size >= MEM_BLOCK_SIZE) {
    /* Would not fit in any arena block: give it a block of its own. */
    return getMem(size);
  }

  if (!MemBlock || (size >= (MEM_BLOCK_SIZE - allocated))) {
    char* block = getMem(MEM_BLOCK_SIZE);
    if (!block) return NULL;
    MemBlock = block;
    allocated = 0;
  }

  ans = MemBlock;
  MemBlock += size;
  allocated += size;
  return ans;
}
00090 
00091 void readRDFFile (char* file) {
00092   FILE* f = fopen(file, "r");      
00093   if (f) {
00094     RDFT rf = (RDFT)getRDFT(file, 1) ; 
00095     int ok = 1;
00096     char* buff  = (char*) malloc(100 * 1024);
00097     int len ;
00098     int i = 0;
00099     memset(buff, '\0', (100 * 1024));
00100     memset(rf, '\0', sizeof(RDF_FileStruct));
00101 
00102     rf->line = (char*)getMem(RDF_BUF_SIZE);
00103     rf->holdOver = (char*)getMem(RDF_BUF_SIZE);
00104     rf->depth = 1;
00105     rf->lastItem = rf->stack[0] ;
00106     while ((len = fread(buff, 1, (100 * 1024) -1, f)) > 0) {
00107       buff[len] = '\0';
00108       printf("[%i] ", i++);
00109       fflush(0);
00110       if (!(ok = parseNextRDFXMLBlobInt(rf, buff, len))) {
00111         printf("Error in RDF File\n");
00112       }
00113     }
00114     
00115     freeMem(rf->line);
00116     rf->line = NULL;
00117     freeMem(rf->holdOver);
00118     rf->holdOver = NULL;
00119     free(buff);
00120     printf("Finished reading %s\n", file); 
00121   } else  printf("Could not find %s\n", file);    
00122 }
00123 
00124 static HashTable rdftHash = NULL;
00125 
00126 RDFT
00127 getRDFT (char* key, int createp) {
00128   RDFT existing = (RDFT) HashLookup(rdftHash, key);
00129   if (existing) {
00130     return existing;
00131   } else if (createp){
00132     existing = (RDFT)getMem(sizeof(RDF_FileStruct));
00133     existing->url = fcopyString(key);
00134     HashAdd(rdftHash, existing->url, existing);
00135     return existing;
00136   } else return NULL;
00137 }
00138 
00139 void 
00140 rdf_init () {
00141   error_string = getMem(1000);
00142   resourceHash = NewHashTable((int)0x00000FFF);
00143   rdftHash = NewHashTable((int)0x00000FFF);
00144 }
00145 
00146 int
00147 rdf_DigestNewStuff (char* url, char* data, int len) {
00148   RDFT rf = (RDFT)getRDFT(url, 1) ; 
00149   int ok = 1;
00150   RDF_Resource u;
00151   unloadRDFT(rf);
00152   memset(rf, '\0', sizeof(RDF_FileStruct));
00153   rf->line = (char*)getMem(RDF_BUF_SIZE);
00154   rf->holdOver = (char*)getMem(RDF_BUF_SIZE);
00155   rf->depth = 1;
00156   rf->lastItem = rf->stack[0] ; 
00157   ok = parseNextRDFXMLBlobInt(rf, data, len);  
00158   /* if (!ok) unloadRDFT(rf); */
00159   freeMem(rf->line);
00160   rf->line = NULL;
00161   freeMem(rf->holdOver);
00162   rf->holdOver = NULL;
00163   return ok;
00164 }
00165 
00166 
/*
 * Return 1 when `uuid` begins with `pattern`, 0 otherwise.
 * An empty pattern matches every string.
 *
 * Only the first strlen(pattern) characters of `uuid` are examined, so the
 * cost does not depend on the length of `uuid`.
 */
int
startsWith (const char* pattern, const char* uuid) {
  /* If uuid ends first, its terminator differs from the pattern character
     and the comparison fails before anything past it is read. */
  while (*pattern != '\0') {
    if (*pattern++ != *uuid++) return 0;
  }
  return 1;
}
00178 
/*
 * Allocate `n` zero-filled bytes.
 * Returns whatever calloc() returns, i.e. NULL on failure.
 */
char* 
getMem (size_t n) {
  void* zeroed = calloc(1, n);
  return (char*) zeroed;
}
00183 
/*
 * Release a block with free().  Passing NULL is harmless because
 * free(NULL) does nothing.
 */
void 
freeMem(void* item)
{
  free(item);
}
00188 
/*
 * Decode an XML predefined entity reference.
 *
 * string         - text immediately after the '&' (NUL-terminated).
 * stringIndexPtr - caller's scan index; advanced by the number of characters
 *                  the entity name occupies (including the ';') on a match.
 * len            - unused; kept for interface compatibility.
 *
 * Recognizes lt, gt, amp, quot and apos.  Anything else is treated as a
 * literal '&' and the index is left untouched.
 */
char 
decodeEntityRef (char* string, int* stringIndexPtr, int len) {
  static const struct {
    const char* name;
    int nameLen;
    char value;
  } entities[] = {
    { "lt;",   3, '<'  },
    { "gt;",   3, '>'  },
    { "amp;",  4, '&'  },
    { "quot;", 5, '"'  },
    { "apos;", 5, '\'' }
  };
  size_t i;

  (void) len;

  if (string && stringIndexPtr) {
    for (i = 0; i < sizeof(entities) / sizeof(entities[0]); i++) {
      if (strncmp(string, entities[i].name, (size_t) entities[i].nameLen) == 0) {
        *stringIndexPtr += entities[i].nameLen;
        return entities[i].value;
      }
    }
  }
  return '&';
}
00202 
/*
 * Return an arena-allocated (fgetMem) copy of `string` in which
 *   - leading whitespace is dropped,
 *   - every later run of whitespace is collapsed to a single ' ',
 *   - each '&' is passed through decodeEntityRef().
 *
 * The result is always NUL-terminated.  Returns NULL when `string` is NULL or
 * memory cannot be obtained.
 */
char *
copyStringIgnoreWhiteSpace(char* string)
{
   int len;
   char* buf;
   int inWhiteSpace = 1;   /* start "in whitespace" so leading blanks vanish */
   int buffIndex = 0;
   int stringIndex = 0;

   if (!string) return NULL;

   len = (int) strlen(string);
   buf = (char*) fgetMem(len + 1);
   if (!buf) return NULL;

   while (stringIndex < len) {
     char nextChar = string[stringIndex];
     if (!wsCharp(nextChar)) {
       if (nextChar == '&') {
         /* decodeEntityRef advances stringIndex past a recognized entity. */
         char* afterAmp = &string[stringIndex + 1];
         int remaining = len - stringIndex;
         buf[buffIndex++] = decodeEntityRef(afterAmp, &stringIndex, remaining);
       } else {
         buf[buffIndex++] = nextChar;
       }
       inWhiteSpace = 0;
     } else {
       if (!inWhiteSpace) buf[buffIndex++] = ' ';
       inWhiteSpace = 1;
     }
     stringIndex++;
   }

   /* At most one output char is produced per input char, so this index is
      within the len + 1 bytes requested. */
   buf[buffIndex] = '\0';
   return buf;
}
00234 
/*
 * Return the value of the "resource" attribute in `attlist`, falling back to
 * "rdf:resource"; NULL when neither lookup succeeds.
 */
char *
getHref(char** attlist)
{
       static char* const names[] = { "resource", "rdf:resource" };
       char* ans = NULL;
       size_t i;

       for (i = 0; (ans == NULL) && (i < sizeof(names) / sizeof(names[0])); i++) {
         ans = getAttributeValue(attlist, names[i]);
       }
       return ans;
}
00242 
00243 
/*
 * Return the value of the first of the attributes "id", "about" and
 * "rdf:about" (tried in that order) that getAttributeValue() finds in
 * `attlist`; NULL when none is found.
 */
char *
getID(char** attlist)
{
       static char* const names[] = { "id", "about", "rdf:about" };
       char* ans = NULL;
       size_t i;

       for (i = 0; (ans == NULL) && (i < sizeof(names) / sizeof(names[0])); i++) {
         ans = getAttributeValue(attlist, names[i]);
       }
       return ans;
}
00252  
00253 
00254 int 
00255 parseNextRDFXMLBlobInt(RDFT f, char* blob, int size) {
00256   int n, last, m;
00257   int somethingseenp = 0;
00258   n = last = 0; 
00259   while (n < size) {
00260     char c = blob[n];
00261     if ((c == '\n') || (c == '\r')) lineNumber++;
00262     m = 0;
00263     somethingseenp = 0;
00264     /*    memset(f->line, '\0', RDF_BUF_SIZE-1); */
00265     if (f->holdOver[0] != '\0') {
00266       memcpy(f->line, f->holdOver, strlen(f->holdOver));
00267       m = strlen(f->holdOver);
00268       somethingseenp = 1;
00269          f->holdOver[0] = '\0';
00270       /*    memset(f->holdOver, '\0', RDF_BUF_SIZE-1); */
00271     }   
00272     while ((n < size) && (wsCharp(c))  && (!somethingseenp)) {
00273       c = blob[++n]; 
00274       if ((c == '\n') || (c == '\r')) lineNumber++;
00275     }
00276     while ((m < RDF_BUF_SIZE-1) && (c != '<') && (c != '>')) {
00277       f->line[m] = c;
00278       m++;
00279       somethingseenp = (somethingseenp || (!(wsCharp(c))));
00280       n++;    
00281       if (n < size) c = blob[n]; 
00282       else break;
00283       if ((c == '\n') || (c == '\r')) lineNumber++;
00284     }
00285     f->line[m] = '\0';
00286     f->line[m+1] = '\0';
00287     if (c == '>') f->line[m] = c;
00288     n++;
00289     if (m > 0) {
00290       if ((c == '<') || (c == '>')) {
00291         last = n;
00292         if (c == '<') {
00293           f->holdOver[0] = '<'; 
00294           f->holdOver[1] = '\0';
00295               }
00296         if (somethingseenp == 1) {
00297           parseNextRDFToken(f, f->line);
00298         }
00299       } else if (size > last) {
00300         memcpy(f->holdOver, f->line, m);
00301         f->holdOver[m] = '\0';
00302       }
00303     } else if (c == '<') {
00304       f->holdOver[0] = '<';
00305       f->holdOver[1] = '\0';
00306     }
00307   }
00308   return(1);
00309 }
00310 
00311 char *
00312 getAttributeValue (char** attlist, char* elName)
00313 {
00314   size_t n = 0;
00315   if (!attlist) return NULL;
00316   while ((n < 2*MAX_ATTRIBUTES) && (*(attlist + n) != NULL)) {
00317     char* attname = *(attlist + n);
00318     char* base = strchr(attname, ':');
00319     if (base) attname = base + 1;
00320     if (strcmp(attname, elName) == 0) return *(attlist + n + 1);
00321     n = n + 2;
00322   }
00323   return NULL;
00324 }
00325 
/*
 * Return a copy of `str` in memory obtained from getMem(); release it with
 * freeMem().  The terminator is copied explicitly.
 * Returns NULL when `str` is NULL or memory cannot be obtained.
 */
char* 
copyString (char* str) {
  size_t len;
  char* ans;

  if (!str) return NULL;

  len = strlen(str);
  ans = getMem(len + 1);
  if (!ans) return NULL;

  memcpy(ans, str, len + 1);
  return ans;
}
00334 
/*
 * Return a copy of `str` in arena memory obtained from fgetMem().  The
 * terminator is copied explicitly.
 * Returns NULL when `str` is NULL or memory cannot be obtained.
 */
char* 
fcopyString (char* str) {
  size_t len;
  char* ans;

  if (!str) return NULL;

  len = strlen(str);
  ans = fgetMem(len + 1);
  if (!ans) return NULL;

  memcpy(ans, str, len + 1);
  return ans;
}
00343 
00344 
00345 
00346 void
00347 addElementProps (char** attlist, char* elementName, RDFT f, RDF_Resource obj)
00348 {
00349   int count = 0;
00350   while (count < 2*MAX_ATTRIBUTES) {
00351     char* attName = attlist[count++];
00352     char* attValue = attlist[count++];
00353     char* baseName;
00354     if ((attName == NULL) || (attValue == NULL)) break;
00355     baseName  = strchr(attName, ':');
00356     if (baseName) attName = baseName + 1;
00357     if (startsWith("xmlns", attName)) {
00358       /* addNameSpace(attName, attValue, f); */
00359     } else if (!stringEquals(attName, "resource") && 
00360         !stringEquals(attName, "rdf:resource")  && 
00361         !stringEquals(attName, "about") && 
00362         !stringEquals(attName, "rdf:about") && 
00363         !stringEquals(attName, "tv") &&
00364         !stringEquals(attName, "id")) {
00365       remoteStoreAdd(f, obj, getResource(attName, 1), 
00366                    copyStringIgnoreWhiteSpace(attValue), 
00367                    RDF_STRING_TYPE, 1);
00368     }
00369   }
00370 }
00371         
00372 int
00373 parseNextRDFToken (RDFT f, char* token)
00374 {
00375   char* attlist[2*MAX_ATTRIBUTES+1];
00376   char* elementName;
00377 
00378   if (token[0] != '<')   {
00379     if ((f->status == EXPECTING_OBJECT) && (f->depth > 1)) {
00380       RDF_Resource u = f->stack[f->depth-2];
00381       RDF_Resource s = f->stack[f->depth-1];
00382       char* val      = copyStringIgnoreWhiteSpace(token);
00383       remoteStoreAdd(f, u, s, val , RDF_STRING_TYPE, 1);
00384          return 1;
00385     } else  {
00386       printf(error_string, "Did not expect \n\"%s\".\n Was expecting a tag.", token);
00387       return 0;
00388     } 
00389   } else if  (startsWith("<!--", token)) {
00390     return 1;
00391   } else if (token[1] == '?')  {
00392     return 1;
00393   } else if (token[1] == '/') {
00394     if ((f->status != EXPECTING_OBJECT) && (f->status != EXPECTING_PROPERTY)) {
00395       printf(error_string, "Did not expect %s. Something pretty screwed up", token);
00396       return 0;
00397     }
00398     if (f->depth > 0) f->depth--;
00399     f->status = (f->status == EXPECTING_OBJECT ? EXPECTING_PROPERTY : EXPECTING_OBJECT);
00400     return 1;
00401   } else if ((f->status == 0) && (startsWith("<RDF:RDF", token) || 
00402                                   startsWith("<RDF", token))) {
00403     f->status = EXPECTING_OBJECT;
00404     return 1;
00405   } else {
00406     int emptyElementp = (token[strlen(token)-2] == '/');  
00407     if ((f->status != EXPECTING_OBJECT) && (f->status != EXPECTING_PROPERTY)) return 1;
00408     if (!tokenizeElement(token, attlist, &elementName)) return 0;
00409     if (f->status == EXPECTING_OBJECT) {
00410       char* url = NULL;
00411       RDF_Resource obj;
00412       int count = 0;    
00413       url = getID(attlist);
00414       if (!url) {
00415         if (f->tagDepth > 2) {
00416           printf(error_string, "Unbalanced tags ");
00417         } else {
00418           printf(error_string, "Require a \"about\" attribute on %s", token);
00419         }
00420         return 0;
00421       }
00422       obj =  getResource(url, 1);
00423       addElementProps (attlist, elementName, f, obj) ;
00424       if (!stringEquals(elementName, "RDF:Description")) {
00425           RDF_Resource eln = getResource(elementName, 1);
00426           remoteStoreAdd(f, obj, getResource("type", 1), 
00427                        eln, RDF_RESOURCE_TYPE, 
00428                        1);        
00429       }
00430       if (f->depth > 1) {
00431         remoteStoreAdd(f, f->stack[f->depth-2], f->stack[f->depth-1], obj, 
00432                      RDF_RESOURCE_TYPE, 1);
00433       }
00434       if (!emptyElementp) {
00435         f->stack[f->depth++] = obj;
00436         f->status = EXPECTING_PROPERTY;
00437       }
00438     } else if (f->status == EXPECTING_PROPERTY) {
00439       char* url;
00440       RDF_Resource obj;
00441       int count = 0;
00442       url = getHref(attlist) ;      
00443       if (url) {
00444         RDF_Resource eln = getResource(elementName, 1);      
00445         obj =  getResource(url, 1);        
00446         addElementProps (attlist, elementName, f, obj) ;     
00447         remoteStoreAdd(f, f->stack[f->depth-1], eln, obj, RDF_RESOURCE_TYPE,  1);
00448         /* printf("%s %s %s\n", RDF_ResourceID(f->stack[f->depth-1]), 
00449                RDF_ResourceID(eln), url); */
00450       } 
00451       if (!emptyElementp) {
00452         f->stack[f->depth++] = getResource(elementName, 1);
00453         f->status = EXPECTING_OBJECT;
00454       }
00455     }
00456     return 1;
00457   }
00458 }      
00459 
00460 
00461 
00462 int
00463 tokenizeElement (char* attr, char** attlist, char** elementName)
00464 {
00465   size_t n = 1;
00466   size_t s = strlen(attr); 
00467   char c ;
00468   size_t m = 0;
00469   size_t atc = 0;
00470   char* base;
00471   int emptyTagp =  (attr[s-2] == '/');
00472   int inAttrNamep = 1;
00473   c = attr[n++]; 
00474   while (wsCharp(c)) {
00475     c = attr[n++];
00476   }
00477   *elementName = &attr[n-1];
00478   while (n < s) {
00479     if (wsCharp(c)) break;
00480     c = attr[n++];
00481   }
00482   attr[n-1] = '\0';
00483   while (atc < 2*MAX_ATTRIBUTES+1) {*(attlist + atc++) = NULL;}
00484   atc = 0;
00485   s = (emptyTagp ? s-2 : s-1);
00486   while (n < s) {
00487     int attributeOpenStringSeenp = 0;
00488     m = 0;
00489     c = attr[n++];
00490     while ((n <= s) && (atc < 2*MAX_ATTRIBUTES)) {
00491       if (inAttrNamep && (m > 0) && (wsCharp(c) || (c == '='))) {
00492        attr[n-1] = '\0';
00493        *(attlist + atc++) = &attr[n-m-1];
00494        break;
00495       }
00496       if  (!inAttrNamep && attributeOpenStringSeenp && (c == '"')) {
00497        attr[n-1] = '\0';
00498        *(attlist + atc++) = &attr[n-m-1];
00499        break;
00500       }
00501       if (inAttrNamep) {
00502        if ((m > 0) || (!wsCharp(c))) m++;
00503       } else {
00504        if (c == '"') {
00505          attributeOpenStringSeenp = 1;
00506        } else {
00507          if ((m > 0) || (!(wsCharp(c)))) m++;
00508        }
00509       }
00510       c = attr[n++];
00511     }
00512     inAttrNamep = (inAttrNamep ? 0 : 1);
00513   }
00514   base = strchr(*elementName, ':');
00515   if (base) *elementName = base+1;
00516   return 1;
00517 }