ofx_preproc.cpp

Go to the documentation of this file.
00001 /***************************************************************************
00002           ofx_preproc.cpp 
00003                              -------------------
00004     copyright            : (C) 2002 by Benoit Grégoire
00005     email                : bock@step.polymtl.ca
00006 ***************************************************************************/
00012 /***************************************************************************
00013  *                                                                         *
00014  *   This program is free software; you can redistribute it and/or modify  *
00015  *   it under the terms of the GNU General Public License as published by  *
00016  *   the Free Software Foundation; either version 2 of the License, or     *
00017  *   (at your option) any later version.                                   *
00018  *                                                                         *
00019  ***************************************************************************/
00020 #include "../config.h"
00021 #include <iostream>
00022 #include <fstream>
00023 #include <stdlib.h>
00024 #include <stdio.h>
00025 #include <string>
00026 #include "ParserEventGeneratorKit.h"
00027 #include "libofx.h"
00028 #include "messages.hh"
00029 #include "ofx_sgml.hh"
00030 #include "ofc_sgml.hh"
00031 #include "ofx_preproc.hh"
00032 #include "ofx_utilities.hh"
00033 #ifdef HAVE_ICONV
00034 #include <iconv.h>
00035 #endif
00036 
00037 #ifdef OS_WIN32
00038 # include "win32.hh"
00039 #endif
00040 
00041 #define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
00042 #define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
00043 
00044 using namespace std;
/** Size, in bytes, of the fixed line buffers used while reading input. */
const unsigned int READ_BUFFER_SIZE = 1024;

/**
 * Number of entries in DTD_SEARCH_PATH.  One extra slot is reserved when the
 * build system supplies MAKEFILE_DTD_PATH.
 */
#ifdef MAKEFILE_DTD_PATH
const int DTD_SEARCH_PATH_NUM = 4;
#else
const int DTD_SEARCH_PATH_NUM = 3;
#endif

/**
 * Directory prefixes that find_dtd() tries, in this order, after the
 * directory configured in the context.  Each entry is concatenated directly
 * with the DTD file name, hence the trailing slashes.
 */
const char *DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM] =
{
#ifdef MAKEFILE_DTD_PATH
  MAKEFILE_DTD_PATH,
#endif
  "/usr/local/share/libofx/dtd/",
  "/usr/share/libofx/dtd/",
  "~/"
};
00065 
00070 CFCT int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
00071   {
00072   LibofxContext *libofx_context;
00073   bool ofx_start=false;
00074   bool ofx_end=false;
00075 
00076   ifstream input_file;
00077   ofstream tmp_file;
00078   char buffer[READ_BUFFER_SIZE];
00079   char iconv_buffer[READ_BUFFER_SIZE * 2];
00080   string s_buffer;
00081   char *filenames[3];
00082   char tmp_filename[256];
00083 #ifdef HAVE_ICONV
00084         iconv_t conversion_descriptor;
00085 #endif
00086   libofx_context=(LibofxContext*)ctx;
00087 
00088   if(p_filename!=NULL&&strcmp(p_filename,"")!=0)
00089     {
00090     message_out(DEBUG, string("ofx_proc_file():Opening file: ")+ p_filename);
00091     
00092     input_file.open(p_filename);
00093     mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
00094     mkstemp(tmp_filename);
00095     tmp_file.open(tmp_filename);
00096 
00097     message_out(DEBUG,"ofx_proc_file(): Creating temp file: "+string(tmp_filename));
00098     if(!input_file){
00099       message_out(ERROR,"ofx_proc_file():Unable to open the input file "+string(p_filename));
00100     }
00101     else if(!tmp_file){
00102       message_out(ERROR,"ofx_proc_file():Unable to open the output file "+string(tmp_filename));
00103     }
00104     else
00105       {
00106         int header_separator_idx;
00107         string header_name;
00108         string header_value;
00109         string ofx_encoding;
00110         string ofx_charset;
00111         do {
00112           input_file.getline(buffer, sizeof(buffer),'\n');
00113           //cout<<buffer<<"\n";
00114           s_buffer.assign(buffer);
00115           //cout<<"input_file.gcount(): "<<input_file.gcount()<<" sizeof(buffer): "<<sizeof(buffer)<<endl;
00116           if(input_file.gcount()<(sizeof(buffer)-1))
00117             {
00118               s_buffer.append("\n");
00119             }
00120           else if( !input_file.eof()&&input_file.fail())
00121             {
00122               input_file.clear();
00123             }
00124           int ofx_start_idx;
00125           if (ofx_start==false &&
00126               (
00127                (libofx_context->currentFileType()==OFX&&
00128                 ((ofx_start_idx=s_buffer.find("<OFX>"))!=
00129                  string::npos||(ofx_start_idx=s_buffer.find("<ofx>"))!=string::npos))
00130                || (libofx_context->currentFileType()==OFC&&
00131                    ((ofx_start_idx=s_buffer.find("<OFC>"))!=string::npos||
00132                     (ofx_start_idx=s_buffer.find("<ofc>"))!=string::npos))
00133               )
00134              )
00135             {
00136               ofx_start=true;
00137               s_buffer.erase(0,ofx_start_idx);//Fix for really broken files that don't have a newline after the header.
00138               message_out(DEBUG,"ofx_proc_file():<OFX> or <OFC> has been found");
00139 #ifdef HAVE_ICONV
00140               string fromcode;
00141               string tocode; 
00142               if(ofx_encoding.compare("USASCII")==0){
00143                 if(ofx_charset.compare("ISO-8859-1")==0){
00144                   fromcode="ISO-8859-1";
00145                 }
00146                 else if(ofx_charset.compare("1252")==0){
00147                   fromcode="CP1252";
00148                 }
00149                 else if(ofx_charset.compare("NONE")==0){
00150                   fromcode=LIBOFX_DEFAULT_INPUT_ENCODING;
00151                 }
00152               }
00153               else if(ofx_encoding.compare("USASCII")==0) {
00154                 fromcode="UTF-8";
00155               }
00156               else
00157                 {
00158                   fromcode=LIBOFX_DEFAULT_INPUT_ENCODING;
00159                 }
00160               tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
00161               message_out(DEBUG,"ofx_proc_file(): Setting up iconv for fromcode: "+fromcode+", tocode: "+tocode);
00162               conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
00163 #endif
00164             }
00165           else {
00166             //We are still in the headers
00167             if ((header_separator_idx=s_buffer.find(':')) != string::npos) {
00168               //Header processing
00169               header_name.assign(s_buffer.substr(0,header_separator_idx));
00170               header_value.assign(s_buffer.substr(header_separator_idx+1));
00171               message_out(DEBUG,"ofx_proc_file():Header: "+header_name+" with value: "+header_value+" has been found");
00172               if(header_name.compare("ENCODING")==0) {
00173                 ofx_encoding.assign(header_value);
00174               }
00175               if(header_name.compare("CHARSET")==0) {
00176                 ofx_charset.assign(header_value);
00177               }
00178             }
00179           }
00180 
00181           if(ofx_start==true && ofx_end==false){
00182             s_buffer=sanitize_proprietary_tags(s_buffer);
00183             //cout<< s_buffer<<"\n";
00184 #ifdef HAVE_ICONV
00185             memset(iconv_buffer,0,READ_BUFFER_SIZE * 2);
00186             size_t inbytesleft = strlen(s_buffer.c_str());
00187             size_t outbytesleft = READ_BUFFER_SIZE * 2 - 1;
00188 #ifdef OS_WIN32
00189             const char * inchar = (const char *)s_buffer.c_str();
00190 #else
00191             char * inchar = (char *)s_buffer.c_str();
00192 #endif
00193             char * outchar = iconv_buffer;
00194             int iconv_retval = iconv (conversion_descriptor,
00195                     &inchar, &inbytesleft,
00196                    &outchar, &outbytesleft);
00197             if(iconv_retval==-1){
00198               message_out(ERROR,"ofx_proc_file(): Conversion error");
00199             }
00200             s_buffer = iconv_buffer;
00201 #endif
00202               tmp_file.write(s_buffer.c_str(), s_buffer.length());
00203           }
00204           
00205           if (ofx_start==true &&
00206               (
00207                (libofx_context->currentFileType()==OFX &&
00208                 ((ofx_start_idx=s_buffer.find("</OFX>"))!=string::npos ||
00209                  (ofx_start_idx=s_buffer.find("</ofx>"))!=string::npos))
00210                || (libofx_context->currentFileType()==OFC &&
00211                    ((ofx_start_idx=s_buffer.find("</OFC>"))!=string::npos ||
00212                     (ofx_start_idx=s_buffer.find("</ofc>"))!=string::npos))
00213               )
00214              )
00215             {
00216               ofx_end=true;
00217               message_out(DEBUG,"ofx_proc_file():</OFX> or </OFC>  has been found");
00218             }
00219 
00220         } while(!input_file.eof()&&!input_file.bad());
00221       }
00222     input_file.close();
00223     tmp_file.close();
00224 #ifdef HAVE_ICONV
00225               iconv_close(conversion_descriptor);
00226 #endif
00227     char filename_openspdtd[255];
00228     char filename_dtd[255];
00229     char filename_ofx[255];
00230     strncpy(filename_openspdtd,find_dtd(ctx, OPENSPDCL_FILENAME).c_str(),255);//The opensp sgml dtd file
00231     if(libofx_context->currentFileType()==OFX)
00232       {
00233         strncpy(filename_dtd,find_dtd(ctx, OFX160DTD_FILENAME).c_str(),255);//The ofx dtd file
00234       }
00235     else if(libofx_context->currentFileType()==OFC)
00236       {
00237         strncpy(filename_dtd,find_dtd(ctx, OFCDTD_FILENAME).c_str(),255);//The ofc dtd file
00238       }
00239     else
00240       {
00241         message_out(ERROR,string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00242       }
00243 
00244     if((string)filename_dtd!="" && (string)filename_openspdtd!="")
00245       {
00246         strncpy(filename_ofx,tmp_filename,255);//The processed ofx file
00247         filenames[0]=filename_openspdtd;
00248         filenames[1]=filename_dtd;
00249         filenames[2]=filename_ofx;
00250         if(libofx_context->currentFileType()==OFX)
00251           {
00252             ofx_proc_sgml(libofx_context, 3,filenames);
00253           }
00254         else if(libofx_context->currentFileType()==OFC)
00255           {
00256             ofc_proc_sgml(libofx_context, 3,filenames);
00257           }
00258         else
00259           {
00260             message_out(ERROR,string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00261           }
00262         if(remove(tmp_filename)!=0)
00263           {
00264             message_out(ERROR,"ofx_proc_file(): Error deleting temporary file "+string(tmp_filename));
00265           }
00266       }
00267     else
00268       {
00269         message_out(ERROR,"ofx_proc_file(): FATAL: Missing DTD, aborting");
00270       }
00271   }
00272   else{
00273     message_out(ERROR,"ofx_proc_file():No input file specified");
00274   }
00275   return 0;
00276 }
00277 
00278 
00279 
00280 CFCT int libofx_proc_buffer(LibofxContextPtr ctx,
00281                             const char *s, unsigned int size){
00282   ofstream tmp_file;
00283   string s_buffer;
00284   char *filenames[3];
00285   char tmp_filename[256];
00286   ssize_t pos;
00287   LibofxContext *libofx_context;
00288 
00289   libofx_context=(LibofxContext*)ctx;
00290 
00291   if (size==0) {
00292     message_out(ERROR,
00293                 "ofx_proc_file(): bad size");
00294     return -1;
00295   }
00296   s_buffer=string(s, size);
00297 
00298   mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
00299   mkstemp(tmp_filename);
00300   tmp_file.open(tmp_filename);
00301 
00302   message_out(DEBUG,"ofx_proc_file(): Creating temp file: "+string(tmp_filename));
00303   if(!tmp_file){
00304     message_out(ERROR,"ofx_proc_file():Unable to open the output file "+string(tmp_filename));
00305     return -1;
00306   }
00307 
00308   if (libofx_context->currentFileType()==OFX) {
00309     pos=s_buffer.find("<OFX>");
00310     if (pos==string::npos)
00311       pos=s_buffer.find("<ofx>");
00312   }
00313   else if (libofx_context->currentFileType()==OFC) {
00314     pos=s_buffer.find("<OFC>");
00315     if (pos==string::npos)
00316       pos=s_buffer.find("<ofc>");
00317   }
00318   else {
00319     message_out(ERROR,"ofx_proc(): unknown file type");
00320     return -1;
00321   }
00322   if (pos==string::npos || pos > s_buffer.size()) {
00323     message_out(ERROR,"ofx_proc():<OFX> has not been found");
00324     return -1;
00325   }
00326   else {
00327     // erase everything before the OFX tag
00328     s_buffer.erase(0, pos);
00329     message_out(DEBUG,"ofx_proc_file():<OF?> has been found");
00330   }
00331 
00332   if (libofx_context->currentFileType()==OFX) {
00333     pos=s_buffer.find("</OFX>");
00334     if (pos==string::npos)
00335       pos=s_buffer.find("</ofx>");
00336   }
00337   else if (libofx_context->currentFileType()==OFC) {
00338     pos=s_buffer.find("</OFC>");
00339     if (pos==string::npos)
00340       pos=s_buffer.find("</ofc>");
00341   }
00342   else {
00343     message_out(ERROR,"ofx_proc(): unknown file type");
00344     return -1;
00345   }
00346 
00347   if (pos==string::npos || pos > s_buffer.size()) {
00348     message_out(ERROR,"ofx_proc():</OF?> has not been found");
00349     return -1;
00350   }
00351   else {
00352     // erase everything after the /OFX tag
00353     if (s_buffer.size() > pos+6)
00354       s_buffer.erase(pos+6);
00355     message_out(DEBUG,"ofx_proc_file():<OFX> has been found");
00356   }
00357 
00358   s_buffer=sanitize_proprietary_tags(s_buffer);
00359   tmp_file.write(s_buffer.c_str(), s_buffer.length());
00360 
00361   tmp_file.close();
00362 
00363   char filename_openspdtd[255];
00364   char filename_dtd[255];
00365   char filename_ofx[255];
00366   strncpy(filename_openspdtd,find_dtd(ctx, OPENSPDCL_FILENAME).c_str(),255);//The opensp sgml dtd file
00367   if(libofx_context->currentFileType()==OFX){
00368     strncpy(filename_dtd,find_dtd(ctx, OFX160DTD_FILENAME).c_str(),255);//The ofx dtd file
00369   }
00370   else if(libofx_context->currentFileType()==OFC){
00371     strncpy(filename_dtd,find_dtd(ctx, OFCDTD_FILENAME).c_str(),255);//The ofc dtd file
00372   }
00373   else {
00374     message_out(ERROR,string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00375   }
00376 
00377   if((string)filename_dtd!="" && (string)filename_openspdtd!=""){
00378     strncpy(filename_ofx,tmp_filename,255);//The processed ofx file
00379     filenames[0]=filename_openspdtd;
00380     filenames[1]=filename_dtd;
00381     filenames[2]=filename_ofx;
00382     if(libofx_context->currentFileType()==OFX){
00383       ofx_proc_sgml(libofx_context, 3,filenames);
00384     }
00385     else if(libofx_context->currentFileType()==OFC){
00386       ofc_proc_sgml(libofx_context, 3,filenames);
00387     }
00388     else {
00389       message_out(ERROR,string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00390     }
00391     if(remove(tmp_filename)!=0){
00392       message_out(ERROR,"ofx_proc_file(): Error deleting temporary file "+string(tmp_filename));
00393     }
00394   }
00395   else {
00396     message_out(ERROR,"ofx_proc_file(): FATAL: Missing DTD, aborting");
00397   }
00398 
00399   return 0;
00400 }
00401 
00402 
00403 
00404 
00405 
00406 
00411 string sanitize_proprietary_tags(string input_string)
00412 {
00413   unsigned int i;
00414   size_t input_string_size;
00415   bool strip=false;
00416   bool tag_open=false;
00417   int tag_open_idx=0;//Are we within < > ?
00418   bool closing_tag_open=false;//Are we within </ > ?
00419   int orig_tag_open_idx=0;
00420   bool proprietary_tag=false; //Are we within a proprietary element?
00421   bool proprietary_closing_tag=false;
00422   int crop_end_idx=0;
00423   char buffer[READ_BUFFER_SIZE]="";
00424   char tagname[READ_BUFFER_SIZE]="";
00425   int tagname_idx=0;
00426   char close_tagname[READ_BUFFER_SIZE]="";
00427  
00428   for(i=0;i<READ_BUFFER_SIZE;i++){
00429     buffer[i]=0;
00430     tagname[i]=0;
00431     close_tagname[i]=0;
00432   }
00433   
00434   input_string_size=input_string.size();
00435   
00436   for(i=0;i<=input_string_size;i++){
00437     if(input_string.c_str()[i]=='<'){
00438       tag_open=true;
00439       tag_open_idx=i;
00440       if(proprietary_tag==true&&input_string.c_str()[i+1]=='/'){
00441         //We are now in a closing tag
00442         closing_tag_open=true;
00443         //cout<<"Comparaison: "<<tagname<<"|"<<&(input_string.c_str()[i+2])<<"|"<<strlen(tagname)<<endl;
00444         if(strncmp(tagname,&(input_string.c_str()[i+2]),strlen(tagname))!=0){
00445           //If it is the begining of an other tag
00446           //cout<<"DIFFERENT!"<<endl;
00447           crop_end_idx=i-1;
00448           strip=true;
00449         }
00450         else{
00451           //Otherwise, it is the start of the closing tag of the proprietary tag
00452           proprietary_closing_tag=true;
00453         }
00454       }
00455       else if(proprietary_tag==true){
00456         //It is the start of a new tag, following a proprietary tag
00457         crop_end_idx=i-1;
00458         strip=true;
00459       }
00460     }
00461     else if(input_string.c_str()[i]=='>'){
00462       tag_open=false;
00463       closing_tag_open=false;
00464       tagname[tagname_idx]=0;
00465       tagname_idx=0;
00466       if(proprietary_closing_tag==true){
00467         crop_end_idx=i;
00468         strip=true;
00469       }
00470     }
00471     else if(tag_open==true&&closing_tag_open==false){
00472       if(input_string.c_str()[i]=='.'){
00473         if(proprietary_tag!=true){
00474           orig_tag_open_idx = tag_open_idx;
00475           proprietary_tag=true;
00476         }
00477       }
00478       tagname[tagname_idx]=input_string.c_str()[i];
00479       tagname_idx++;
00480     }
00481     //cerr <<i<<endl;
00482     if(strip==true && orig_tag_open_idx < input_string.size())
00483       {
00484         input_string.copy(buffer,(crop_end_idx-orig_tag_open_idx)+1,orig_tag_open_idx);
00485         message_out(INFO,"sanitize_proprietary_tags() (end tag or new tag) removed: "+string(buffer));
00486         input_string.erase(orig_tag_open_idx,(crop_end_idx-orig_tag_open_idx)+1);
00487         i=orig_tag_open_idx-1;
00488         proprietary_tag=false;
00489         proprietary_closing_tag=false;
00490         closing_tag_open=false;
00491         tag_open=false;
00492         strip=false;
00493       }
00494 
00495   }//end for
00496   if(proprietary_tag==true && orig_tag_open_idx < input_string.size()){
00497     if(crop_end_idx==0){//no closing tag
00498       crop_end_idx=input_string.size()-1;
00499     }
00500     input_string.copy(buffer,(crop_end_idx-orig_tag_open_idx)+1,orig_tag_open_idx);
00501     message_out(INFO,"sanitize_proprietary_tags() (end of line) removed: "+string(buffer));
00502     input_string.erase(orig_tag_open_idx,(crop_end_idx-orig_tag_open_idx)+1);
00503   }
00504   return input_string;
00505 }
00506 
00507 
00508 
00514 string find_dtd(LibofxContextPtr ctx, string dtd_filename)
00515 {
00516   int i;
00517   ifstream dtd_file;
00518   string dtd_path_filename;
00519   bool dtd_found=false;
00520 
00521   dtd_path_filename=((LibofxContext*)ctx)->dtdDir();
00522   if (!dtd_path_filename.empty()) {
00523     dtd_path_filename.append(dtd_filename);
00524     dtd_file.clear();
00525     dtd_file.open(dtd_path_filename.c_str());
00526     if(dtd_file){
00527       message_out(STATUS,"find_dtd():DTD found: "+dtd_path_filename);
00528       dtd_file.close();
00529       dtd_found=true;
00530     }
00531   }
00532 
00533   if (!dtd_found) {
00534     for(i=0;i<DTD_SEARCH_PATH_NUM&&dtd_found==false;i++){
00535       dtd_path_filename=DTD_SEARCH_PATH[i];
00536       dtd_path_filename.append(dtd_filename);
00537       dtd_file.clear();
00538       dtd_file.open(dtd_path_filename.c_str());
00539       if(!dtd_file){
00540         message_out(DEBUG,"find_dtd():Unable to open the file "+dtd_path_filename);
00541       }
00542       else{
00543         message_out(STATUS,"find_dtd():DTD found: "+dtd_path_filename);
00544         dtd_file.close();
00545         dtd_found=true;
00546       }
00547     }
00548   }
00549 
00550   if(dtd_found==false){
00551     message_out(ERROR,"find_dtd():Unable to find the DTD named " + dtd_filename);
00552     dtd_path_filename="";
00553   }
00554   return dtd_path_filename;
00555 }
00556 
00557 

Generated on Mon Feb 9 21:21:59 2009 for LibOFX by  doxygen 1.5.0