Back to index

wims  3.65+svn20090927
DataSheetStream.java
Go to the documentation of this file.
00001 /*
00002     WIMSchem Elements: Chemistry molecular diagram drawing tool.
00003     
00004     (c) 2008 Dr. Alex M. Clark
00005     
00006     Released as GNUware, under the Gnu Public License (GPL)
00007     
00008     See www.gnu.org for details.
00009 */
00010 
00011 package WIMSchem.ds;
00012 
00013 import WIMSchem.*;
00014 
00015 import java.io.*;
00016 import java.util.*;
00017 
00018 /*
00019     Readers and writers of the DataSheet format.
00020     
00021     The native format is XML (or at least, the subset of XML used by the TrivialDOM class), and is structured as follows:
00022     
00023        <?xml version="1.0" encoding="UTF-8"?>
00024        <DataSheet>
00025            <Header ncols=~ nrows=~>
00026               <Column id="1" name=~ type=~>{description}</Column>
00027               <Column id="2" name=~ type=~>{description}</Column>
00028               ...
00029               <Column id="{ncols}" name=~ type=~>{description}</Column>
00030            </Header>
00031            <Content>
00032               <Row id="1">
00033                   <Cell id="1">{data}</Cell>
00034                   <Cell id="2">{data}</Cell>
00035                   ...
00036                   <Cell id="{ncols}">{data}</Cell>
00037               </Row>
00038               ...
00039               <Row id="{nrows}">
00040                   ...
00041               </Row>
00042            </Content>
00043        </DataSheet>
00044            
00045     All indices are 1-based.
00046               
00047 */
00048 
00049 public class DataSheetStream
00050 {
00051     // read-ahead code to try to figure out a filetype
00052 
00053     // returns true if stream is the native datasheet format; preserves file position
00054     public static boolean ExamineIsXMLDS(FileInputStream istr)
00055     {
00056        boolean ret=false;
00057        try
00058        {
00059            long lastpos=istr.getChannel().position();
00060            BufferedReader rdr=new BufferedReader(new InputStreamReader(istr));
00061            ret=ExamineIsXMLDS(rdr);
00062            istr.getChannel().position(lastpos);
00063        }
00064        catch (IOException e) {}
00065        return ret;
00066     }
00067     
00068     // as above, except this version loses the stream position
00069     public static boolean ExamineIsXMLDS(BufferedReader rdr)
00070     {
00071        try
00072        {
00073            for (int n=0;n<2;n++)
00074            {
00075               String str=rdr.readLine();
00076               if (str==null) break;
00077               if (str.startsWith("<DataSheet>")) return true;
00078            }
00079        }
00080        catch (IOException e) {}
00081        return false;
00082     }
00083     
00084     // returns true if stream appears to be an MDL SDfile; preserves file position
00085     public static boolean ExamineIsMDLSDF(FileInputStream istr)
00086     {
00087        boolean ret=false;
00088        try
00089        {
00090            long lastpos=istr.getChannel().position();
00091            BufferedReader rdr=new BufferedReader(new InputStreamReader(istr));
00092            ret=ExamineIsMDLSDF(rdr);
00093            istr.getChannel().position(lastpos);
00094        }
00095        catch (IOException e) {}
00096        return ret;
00097     }
00098 
00099     // as above, except this version loses the stream position
00100     public static boolean ExamineIsMDLSDF(BufferedReader rdr)
00101     {
00102        try
00103        {
00104            for (int n=0;n<3000;n++)
00105            {
00106               String str=rdr.readLine();
00107               if (str==null) break;
00108               if (str.compareTo("$$$$")==0 || str.compareTo("M  END")==0) return true;
00109            }
00110        }
00111        catch (IOException e) {}
00112        return false;
00113      }
00114 
00115    // reading of datasheets from the WIMSchem XML format
00116 
00117     public static DataSheet ReadXML(InputStream istr) throws IOException 
00118     {
00119        return ReadXML(new BufferedReader(new InputStreamReader(istr)));
00120     }
00121     public static DataSheet ReadXML(BufferedReader in) throws IOException
00122     {
00123        TrivialDOM xml=TrivialDOM.ReadXML(in);
00124 
00125        if (xml.Document().NodeName().compareTo("DataSheet")!=0) 
00126            throw new IOException("Input stream is XML, but the root node is not <DataSheet>.");
00127        
00128        DataSheet ds=new DataSheet();
00129        
00130        // do a precursory check
00131        TrivialDOM.Node doc=xml.Document(),header=null,content=null;
00132        for (int n=0;n<doc.NumChildren();n++) if (doc.ChildType(n)==TrivialDOM.TYPE_NODE)
00133        {
00134            TrivialDOM.Node node=doc.GetChildNode(n);
00135            if (node.NodeName().compareTo("Header")==0) header=node;
00136            if (node.NodeName().compareTo("Content")==0) content=node;
00137        }
00138        if (header==null) throw new IOException("XML document lacks a <Header> element.");
00139        if (content==null) throw new IOException("XML document lacks a <Content> element.");
00140        
00141        int ncols=Utils.safeInt(header.Attribute("ncols"),-1),nrows=Utils.safeInt(header.Attribute("nrows"),-1);
00142        if (ncols<0 || ncols>5000) throw new IOException("Header@ncols attribute absent or improperly specified.");
00143        if (nrows<0) throw new IOException("Header@nrows attribute absent or improperly specified.");
00144        
00145        // put the columns into an array, then create in datasheet
00146        String[] colName=new String[ncols],colDescr=new String[ncols];
00147        int[] colType=new int[ncols];
00148        for (int n=0;n<ncols;n++) colName[n]=null; // means unspecified in source
00149        for (int n=0;n<header.NumChildren();n++) if (header.ChildType(n)==TrivialDOM.TYPE_NODE)
00150        {
00151            TrivialDOM.Node node=header.GetChildNode(n);
00152            if (node.NodeName().compareTo("Column")!=0) continue;
00153            int id=Utils.safeInt(node.Attribute("id"),0);
00154            if (id<1 || id>ncols) throw new IOException("Column@id out of range.");
00155            String strName=node.Attribute("name"),strType=node.Attribute("type");
00156            if (strName==null) throw new IOException("Column name not specified.");
00157            if (strType==null) throw new IOException("Column type not specified.");
00158            int type=0;
00159            if (strType.compareTo("molecule")==0) type=DataSheet.COLTYPE_MOLECULE;
00160            else if (strType.compareTo("string")==0) type=DataSheet.COLTYPE_STRING;
00161            else if (strType.compareTo("integer")==0) type=DataSheet.COLTYPE_INTEGER;
00162            else if (strType.compareTo("real")==0) type=DataSheet.COLTYPE_REAL;
00163            else if (strType.compareTo("boolean")==0) type=DataSheet.COLTYPE_BOOLEAN;
00164            else throw new IOException("Coltype type ["+strType+"] not recognised.");
00165            
00166            colName[id-1]=strName;
00167            colType[id-1]=type;
00168            colDescr[id-1]=node.GetText();
00169        }
00170        for (int n=0;n<ncols;n++) if (colName[n]==null) throw new IOException("Column id#"+(n+1)+" is not defined.");
00171        for (int n=0;n<ncols;n++) ds.AppendColumn(colName[n],colType[n],colDescr[n]);
00172        
00173        // append a row for each claimed case, then fill in the data as it is encountered
00174        for (int n=0;n<nrows;n++) ds.AppendRow();
00175        for (int i=0;i<content.NumChildren();i++) if (content.ChildType(i)==TrivialDOM.TYPE_NODE)
00176        {
00177            TrivialDOM.Node row=content.GetChildNode(i);
00178            if (row.NodeName().compareTo("Row")!=0) continue;
00179            int rid=Utils.safeInt(row.Attribute("id"),0);
00180            if (rid<1 || rid>nrows) throw new IOException("Row@id out of range.");
00181            
00182            for (int j=0;j<row.NumChildren();j++) if (row.ChildType(j)==TrivialDOM.TYPE_NODE)
00183            {
00184               TrivialDOM.Node cell=row.GetChildNode(j);
00185               if (cell.NodeName().compareTo("Cell")!=0) continue;
00186               int cid=Utils.safeInt(cell.Attribute("id"),0);
00187               if (cid<1 || cid>ncols) throw new IOException("Cell@id out of range.");
00188               
00189               String data=cell.GetText();
00190               int type=colType[cid-1];
00191               
00192               if (type==DataSheet.COLTYPE_MOLECULE)
00193               {
00194                   Molecule mol=null;
00195                   try {mol=MoleculeStream.ReadUnknown(new BufferedReader(new StringReader(data)));}
00196                   catch (IOException e) {} // leave it null
00197                   ds.SetMolecule(rid-1,cid-1,mol);
00198               }
00199               else if (type==DataSheet.COLTYPE_STRING)
00200               {
00201                   ds.SetString(rid-1,cid-1,data);
00202               }
00203               else if (type==DataSheet.COLTYPE_INTEGER)
00204               {
00205                   try
00206                   {
00207                      int v=new Integer(data).intValue();
00208                      ds.SetInteger(rid-1,cid-1,v);
00209                   }
00210                   catch (NumberFormatException e) {ds.SetToNull(rid-1,cid-1);}
00211               }
00212               else if (type==DataSheet.COLTYPE_REAL)
00213               {
00214                   try
00215                   {
00216                      double v=new Double(data).doubleValue();
00217                      ds.SetReal(rid-1,cid-1,v);
00218                   }
00219                   catch (NumberFormatException e) {ds.SetToNull(rid-1,cid-1);}
00220               }
00221               else if (type==DataSheet.COLTYPE_BOOLEAN)
00222               {
00223                   ds.SetBoolean(rid-1,cid-1,data.toLowerCase().compareTo("true")==0) ;
00224               }
00225            }
00226        }
00227        
00228        return ds;
00229     }
00230 
00231     // reading of datasheets from the MDL SD file format
00232 
00233     public static DataSheet ReadSDF(InputStream istr) throws IOException 
00234     {
00235        return ReadSDF(new BufferedReader(new InputStreamReader(istr)));
00236     }
00237     public static DataSheet ReadSDF(BufferedReader in) throws IOException
00238     {
00239        DataSheet ds=new DataSheet();
00240        
00241        ds.AppendColumn("mol",DataSheet.COLTYPE_MOLECULE,"Molecule");
00242 
00243        ArrayList<String> entry=new ArrayList<String>();
00244 
00245        while (true)
00246        {
00247            String line=in.readLine();
00248            if (line==null) break;
00249            if (!line.startsWith("$$$$")) {entry.add(line); continue;}
00250            
00251            int rn=ds.AppendRow();
00252            
00253            StringBuffer sb=new StringBuffer();
00254            int pos=0;
00255            while (pos<entry.size())
00256            {
00257               line=entry.get(pos);
00258               if (line.startsWith("> ")) break;
00259               sb.append(line+"\n"); 
00260               pos++; 
00261               if (line.startsWith("M  END")) {break;}
00262            }
00263 
00264            Molecule mol=null;
00265            try {mol=MoleculeStream.ReadMDLMOL(new BufferedReader(new StringReader(sb.toString())));}
00266            catch (IOException e) {} // leave it null
00267            if (mol!=null) ds.SetMolecule(rn,0,mol);
00268            
00269            for (;pos+2<entry.size();pos+=3)
00270            {
00271               String key=entry.get(pos),val=entry.get(pos+1);
00272               if (!key.startsWith(">")) continue;
00273               int z=key.indexOf("<"); if (z<0) continue;
00274               key=key.substring(z+1);
00275               z=key.indexOf(">"); if (z<0) continue;
00276               key=key.substring(0,z);
00277               if (key.length()==0) continue;
00278               
00279               int type=val.length() > 0 ? DataSheet.COLTYPE_STRING : DataSheet.COLTYPE_INTEGER;
00280               double dval=0;
00281               int ival=0;
00282               try
00283               {
00284                   dval=Double.parseDouble(val);
00285                   ival=(int)Math.round(dval);
00286                   type=dval==ival ? DataSheet.COLTYPE_INTEGER : DataSheet.COLTYPE_REAL;
00287               }
00288               catch (NumberFormatException e) {} // stays string
00289 
00290               int cn=-1;
00291               for (int n=0;n<ds.NumCols();n++) if (ds.ColName(n).compareTo(key)==0) {cn=n; break;}
00292               if (cn<0) cn=ds.AppendColumn(key,type,"");
00293               
00294               int curType=ds.ColType(cn);
00295               if (val.length()==0) ds.SetToNull(rn,cn);
00296               else if (curType==DataSheet.COLTYPE_STRING) ds.SetString(rn,cn,val);
00297               else if (curType==DataSheet.COLTYPE_REAL)
00298               {
00299                   if (type==DataSheet.COLTYPE_STRING)
00300                   {
00301                      if (ds.ChangeColumnType(cn,DataSheet.COLTYPE_STRING,false)) ds.SetString(rn,cn,val);
00302                   }
00303                   else ds.SetReal(rn,cn,dval);
00304               }
00305               else if (curType==DataSheet.COLTYPE_INTEGER)
00306               {
00307                   if (type==DataSheet.COLTYPE_STRING)
00308                   {
00309                      if (ds.ChangeColumnType(cn,DataSheet.COLTYPE_STRING,false)) ds.SetString(rn,cn,val);
00310                   }
00311                   else if (type==DataSheet.COLTYPE_REAL)
00312                   {
00313                      if (ds.ChangeColumnType(cn,DataSheet.COLTYPE_REAL,false)) ds.SetReal(rn,cn,dval);
00314                   }
00315                   else ds.SetInteger(rn,cn,ival);
00316               }
00317            }
00318         
00319            entry.clear();
00320        }
00321        
00322        return ds;
00323     }
00324 
00325     // writing of datasheets to the WIMSchem XML format
00326     
00327     public static void WriteXML(OutputStream ostr,DataSheet ds) throws IOException
00328     {
00329        WriteXML(new BufferedWriter(new OutputStreamWriter(ostr)),ds);
00330     }
00331     public static void WriteXML(BufferedWriter out,DataSheet ds) throws IOException
00332     {
00333        TrivialDOM xml=new TrivialDOM("DataSheet");
00334 
00335        int ncols=ds.NumCols(),nrows=ds.NumRows();
00336 
00337        TrivialDOM.Node header=xml.Document().AppendNode("Header");
00338        header.SetAttribute("ncols",ds.NumCols()+"");
00339        header.SetAttribute("nrows",ds.NumRows()+"");
00340        for (int n=0;n<ncols;n++)
00341        {
00342            TrivialDOM.Node col=header.AppendNode("Column");
00343            col.SetAttribute("id",String.valueOf(n+1));
00344            col.SetAttribute("name",ds.ColName(n));
00345            int type=ds.ColType(n);
00346            if (type==DataSheet.COLTYPE_MOLECULE) col.SetAttribute("type","molecule");
00347            else if (type==DataSheet.COLTYPE_STRING) col.SetAttribute("type","string");
00348            else if (type==DataSheet.COLTYPE_INTEGER) col.SetAttribute("type","integer");
00349            else if (type==DataSheet.COLTYPE_REAL) col.SetAttribute("type","real");
00350            else if (type==DataSheet.COLTYPE_BOOLEAN) col.SetAttribute("type","boolean");
00351 
00352            col.SetText(ds.ColDescr(n),false);
00353        }
00354 
00355        TrivialDOM.Node content=xml.Document().AppendNode("Content");
00356        for (int i=0;i<nrows;i++)
00357        {
00358            TrivialDOM.Node row=content.AppendNode("Row");
00359            row.SetAttribute("id",String.valueOf(i+1));
00360            for (int j=0;j<ncols;j++)
00361            {
00362               TrivialDOM.Node col=row.AppendNode("Cell");
00363               col.SetAttribute("id",String.valueOf(j+1));
00364               int type=ds.ColType(j);
00365               if (ds.IsNull(i,j)) {} // do nothing (stays blank)
00366               else if (type==DataSheet.COLTYPE_MOLECULE)
00367               {
00368                   try
00369                   {
00370                      StringWriter sw=new StringWriter();
00371                      BufferedWriter bw=new BufferedWriter(sw);
00372                      MoleculeStream.WriteNative(bw,ds.GetMolecule(i,j));
00373                      col.SetText(sw.toString(),true);
00374                   }
00375                   catch (IOException e) {} // entry stays blank
00376               }
00377               else if (type==DataSheet.COLTYPE_STRING) col.SetText(ds.GetString(i,j),true);
00378               else if (type==DataSheet.COLTYPE_INTEGER) col.SetText(String.valueOf(ds.GetInteger(i,j)),false);
00379               else if (type==DataSheet.COLTYPE_REAL) col.SetText(String.valueOf(ds.GetReal(i,j)),false);
00380               else if (type==DataSheet.COLTYPE_BOOLEAN) col.SetText(ds.GetBoolean(i,j) ? "true" : "false",false);
00381            }
00382        }
00383 
00384        TrivialDOM.WriteXML(out,xml);
00385     }
00386     
00387     // writing of datasheets to the MDL SD file format
00388     
00389     public static void WriteSDF(OutputStream ostr,DataSheet ds) throws IOException
00390     {
00391        WriteSDF(new BufferedWriter(new OutputStreamWriter(ostr)),ds);
00392     }
00393     public static void WriteSDF(BufferedWriter out,DataSheet ds) throws IOException
00394     {
00395        int molfld=-1;
00396        for (int n=0;n<ds.NumCols();n++) if (ds.ColType(n)==DataSheet.COLTYPE_MOLECULE) {molfld=n; break;}
00397        
00398        for (int i=0;i<ds.NumRows();i++)
00399        {
00400            if (molfld>=0) if (!ds.IsNull(i,molfld))
00401            {
00402               MoleculeStream.WriteMDLMOL(out,ds.GetMolecule(i,molfld));
00403            }
00404            for (int j=0;j<ds.NumCols();j++) if (ds.ColType(j)!=DataSheet.COLTYPE_MOLECULE && !ds.IsNull(i,j))
00405            {
00406               String line="";
00407               
00408               if (ds.ColType(j)==DataSheet.COLTYPE_STRING) line=ds.GetString(i,j);
00409               else if (ds.ColType(j)==DataSheet.COLTYPE_INTEGER) line=String.valueOf(ds.GetInteger(i,j));
00410               else if (ds.ColType(j)==DataSheet.COLTYPE_REAL) line=String.valueOf(ds.GetReal(i,j));
00411               else if (ds.ColType(j)==DataSheet.COLTYPE_BOOLEAN) line=ds.GetBoolean(i,j) ? "true" : "false";
00412 
00413               if (line.length()==0) continue;
00414               String[] bits=line.split("\n");
00415               boolean anything=false;
00416               for (int n=0;n<bits.length;n++) if (bits[n].length()>0) anything=true;
00417               if (!anything) continue;
00418               
00419               out.write("> <"+ds.ColName(j)+">\n");
00420               for (int n=0;n<bits.length;n++) if (bits[n].length()>0) 
00421               {
00422                   if (bits[n].length()>78) bits[n]=bits[n].substring(0,78); // tuff
00423                   out.write(bits[n]+"\n");
00424               }
00425               out.write("\n");
00426            }
00427            
00428            out.write("$$$$\n");
00429        }
00430        
00431        out.flush();
00432     }
00433 }