LibOFX
ofx_preproc.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  ofx_preproc.cpp
3  -------------------
4  copyright : (C) 2002 by Benoit Grégoire
5  email : benoitg@coeus.ca
6 ***************************************************************************/
12 /***************************************************************************
13  * *
14  * This program is free software; you can redistribute it and/or modify *
15  * it under the terms of the GNU General Public License as published by *
16  * the Free Software Foundation; either version 2 of the License, or *
17  * (at your option) any later version. *
18  * *
19  ***************************************************************************/
#include "../config.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
#include <sstream>
#include <string>
#include <vector>
#include "ParserEventGeneratorKit.h"
#include "libofx.h"
#include "messages.hh"
#include "ofx_sgml.hh"
#include "ofc_sgml.hh"
#include "ofx_preproc.hh"
#include "ofx_utilities.hh"
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
#ifdef __WIN32__
# include <io.h> // for close()
#else
# include <unistd.h> // for close()
#endif
37 
38 #ifdef __WIN32__
39 # define DIRSEP "\\"
40 #else
41 # define DIRSEP "/"
42 #endif
43 
44 #ifdef __WIN32__
45 # include "win32.hh"
46 # include <windows.h> // for GetModuleFileName()
47 # undef ERROR
48 # undef DELETE
49 #endif
50 
51 #define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
52 #define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
53 
54 using namespace std;
/* The number of different paths to search for DTDs.  It must match the number
 * of initializers in DTD_SEARCH_PATH below: the build may contribute one extra
 * entry through the MAKEFILE_DTD_PATH macro.
 */
#ifdef MAKEFILE_DTD_PATH
const int DTD_SEARCH_PATH_NUM = 4;
#else
const int DTD_SEARCH_PATH_NUM = 3;
#endif

/* The list of paths to search for the DTDs.  find_dtd() walks the entries in
 * order and uses each one verbatim as a directory name.
 * NOTE(review): "~" is therefore opened literally; nothing in this file expands
 * it to the user's home directory.
 */
const char *DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM] =
{
#ifdef MAKEFILE_DTD_PATH
  MAKEFILE_DTD_PATH ,
#endif
  "/usr/local/share/libofx/dtd",
  "/usr/share/libofx/dtd",
  "~"
};
76 
81 int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
82 {
83  LibofxContext *libofx_context;
84  bool ofx_start = false;
85  bool ofx_end = false;
86  bool file_is_xml = false;
87  bool used_iconv = false;
88  ifstream input_file;
89  ofstream tmp_file;
90  char *filenames[3];
91  char tmp_filename[256];
92  int tmp_file_fd;
93 #ifdef HAVE_ICONV
94  iconv_t conversion_descriptor;
95 #endif
96  libofx_context = (LibofxContext*)ctx;
97 
98  if (p_filename != NULL && strcmp(p_filename, "") != 0)
99  {
100  message_out(DEBUG, string("ofx_proc_file():Opening file: ") + p_filename);
101 
102  input_file.open(p_filename);
103  if (!input_file)
104  {
105  message_out(ERROR, "ofx_proc_file():Unable to open the input file " + string(p_filename));
106  }
107 
108  mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
109 
110  message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + string(tmp_filename));
111 #ifdef __WIN32__
112  tmp_file_fd = mkstemp_win32(tmp_filename);
113 #else
114  tmp_file_fd = mkstemp(tmp_filename);
115 #endif
116  if (tmp_file_fd)
117  {
118  tmp_file.open(tmp_filename);
119  if (!tmp_file)
120  {
121  message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + string(tmp_filename));
122  return -1;
123  }
124  }
125  else
126  {
127  message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + string(tmp_filename));
128  return -1;
129  }
130 
131  if (input_file && tmp_file)
132  {
133  int header_separator_idx;
134  string header_name;
135  string header_value;
136  string ofx_encoding;
137  string ofx_charset;
138  do
139  {
140  stringbuf buffer;
141  string s_buffer;
142  input_file.get(buffer, '\n');
143  //cout<< "got: \"" << buffer<<"\"\n";
144  s_buffer = buffer.str();
145 
146  // Watch out: If input_file is in eof(), any subsequent read or
147  // peek() will fail and we must exit this loop.
148  if (!input_file.eof())
149  {
150  //cout<<"input_file.gcount(): "<<input_file.gcount()<< " s_buffer.size=" << s_buffer.size()<<" sizeof(buffer): "<<sizeof(buffer) << " peek=\"" << int(input_file.peek()) << "\"" <<endl;
151  if (input_file.fail()) // If no characters were extracted above, the failbit is set.
152  {
153  // No characters extracted means that we've reached the newline
154  // delimiter (because we already checked for EOF). We will check
155  // for and remove that newline in the next if-clause, but must
156  // remove the failbit so that peek() will work again.
157  input_file.clear();
158  }
159 
160  // Is the next character really the newline?
161  if (input_file.peek() == '\n')
162  {
163  // Yes. Then discard that newline character from the stream
164  input_file.get();
165  }
166  }
167 
168  if (ofx_start == false && (s_buffer.find("<?xml") != string::npos))
169  {
170  message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped.");
171  file_is_xml = true;
172  }
173 
174  int ofx_start_idx;
175  if (ofx_start == false)
176  {
177  if (
178  (libofx_context->currentFileType() == OFX &&
179  ((ofx_start_idx = s_buffer.find("<OFX>")) != string::npos ||
180  (ofx_start_idx = s_buffer.find("<ofx>")) != string::npos))
181  ||
182  (libofx_context->currentFileType() == OFC &&
183  ((ofx_start_idx = s_buffer.find("<OFC>")) != string::npos ||
184  (ofx_start_idx = s_buffer.find("<ofc>")) != string::npos))
185  )
186  {
187  ofx_start = true;
188  if (file_is_xml == false)
189  {
190  s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header.
191  }
192  message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found");
193 
194  if (file_is_xml == true)
195  {
196  static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
197  if (putenv(sp_charset_fixed) != 0)
198  {
199  message_out(ERROR, "ofx_proc_file(): putenv failed");
200  }
201  /* Normally the following would be "xml".
202  * Unfortunately, opensp's generic api will garble UTF-8 if this is
203  * set to xml. So we set any single byte encoding to avoid messing
204  * up UTF-8. Unfortunately this means that non-UTF-8 files will not
205  * get properly translated. We'd need to manually detect the
206  * encoding in the XML header and convert the xml with iconv like we
207  * do for SGML to work around the problem. Most unfortunate. */
208  static char sp_encoding[] = "SP_ENCODING=ms-dos";
209  if (putenv(sp_encoding) != 0)
210  {
211  message_out(ERROR, "ofx_proc_file(): putenv failed");
212  }
213  }
214  else
215  {
216  static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
217  if (putenv(sp_charset_fixed) != 0)
218  {
219  message_out(ERROR, "ofx_proc_file(): putenv failed");
220  }
221  static char sp_encoding[] = "SP_ENCODING=ms-dos"; //Any single byte encoding will do, we don't want opensp messing up UTF-8;
222  if (putenv(sp_encoding) != 0)
223  {
224  message_out(ERROR, "ofx_proc_file(): putenv failed");
225  }
226 #ifdef HAVE_ICONV
227  string fromcode;
228  string tocode;
229  if (ofx_encoding.compare("USASCII") == 0)
230  {
231  if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0)
232  {
233  //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well...
234  fromcode = "ISO-8859-1";
235  }
236  else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0)
237  {
238  //Only "1252" is actually a legal value, but since the banks follows the spec SO well...
239  fromcode = "CP1252";
240  }
241  else if (ofx_charset.compare("NONE") == 0)
242  {
243  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
244  }
245  else
246  {
247  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
248  }
249  }
250  else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0)
251  {
252  //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such...
253  fromcode = "UTF-8";
254  }
255  else
256  {
257  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
258  }
259  tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
260  message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode);
261  conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
262  used_iconv = true;
263 #endif
264  }
265  }
266  else
267  {
268  //We are still in the headers
269  if ((header_separator_idx = s_buffer.find(':')) != string::npos)
270  {
271  //Header processing
272  header_name.assign(s_buffer.substr(0, header_separator_idx));
273  header_value.assign(s_buffer.substr(header_separator_idx + 1));
274  while ( header_value[header_value.length() -1 ] == '\n' ||
275  header_value[header_value.length() -1 ] == '\r' )
276  header_value.erase(header_value.length() - 1);
277  message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found");
278  if (header_name.compare("ENCODING") == 0)
279  {
280  ofx_encoding.assign(header_value);
281  }
282  if (header_name.compare("CHARSET") == 0)
283  {
284  ofx_charset.assign(header_value);
285  }
286  }
287  }
288  }
289 
290  if (file_is_xml == true || (ofx_start == true && ofx_end == false))
291  {
292  if (ofx_start == true)
293  {
294  /* The above test won't help us if the <OFX> tag is on the same line
295  * as the xml header, but as opensp can't be used to parse it anyway
296  * this isn't a great loss for now.
297  */
298  s_buffer = sanitize_proprietary_tags(s_buffer);
299  if (s_buffer.empty())
300  continue;
301  }
302  //cout<< s_buffer<<"\n";
303  if (file_is_xml == false)
304  {
305 #ifdef HAVE_ICONV
306  size_t inbytesleft = s_buffer.size();
307  size_t outbytesleft = inbytesleft * 2 - 1;
308  char * iconv_buffer = (char*) malloc (inbytesleft * 2);
309  memset(iconv_buffer, 0, inbytesleft * 2);
310 #if defined(__sun) || defined(__NetBSD__)
311  const char * inchar = (const char *)s_buffer.c_str();
312 #else
313  char * inchar = (char *)s_buffer.c_str();
314 #endif
315  char * outchar = iconv_buffer;
316  int iconv_retval = iconv (conversion_descriptor,
317  &inchar, &inbytesleft,
318  &outchar, &outbytesleft);
319  if (iconv_retval == -1)
320  {
321  message_out(ERROR, "ofx_proc_file(): Iconv conversion error");
322  }
323  // All validly converted bytes will be copied to the
324  // original buffer
325  s_buffer = std::string(iconv_buffer, outchar - iconv_buffer);
326  free (iconv_buffer);
327 #endif
328  }
329  //cout << s_buffer << "\n";
330  tmp_file << s_buffer << endl;
331  }
332 
333  if (ofx_start == true &&
334  (
335  (libofx_context->currentFileType() == OFX &&
336  ((ofx_start_idx = s_buffer.find("</OFX>")) != string::npos ||
337  (ofx_start_idx = s_buffer.find("</ofx>")) != string::npos))
338  || (libofx_context->currentFileType() == OFC &&
339  ((ofx_start_idx = s_buffer.find("</OFC>")) != string::npos ||
340  (ofx_start_idx = s_buffer.find("</ofc>")) != string::npos))
341  )
342  )
343  {
344  ofx_end = true;
345  message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC> has been found");
346  }
347 
348  }
349  while (!input_file.eof() && !input_file.bad());
350  }
351  input_file.close();
352  tmp_file.close();
353 #ifdef HAVE_ICONV
354  if (used_iconv == true)
355  {
356  iconv_close(conversion_descriptor);
357  }
358 #endif
359  char filename_openspdtd[255];
360  char filename_dtd[255];
361  char filename_ofx[255];
362  strncpy(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME).c_str(), 255); //The opensp sgml dtd file
363  if (libofx_context->currentFileType() == OFX)
364  {
365  strncpy(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME).c_str(), 255); //The ofx dtd file
366  }
367  else if (libofx_context->currentFileType() == OFC)
368  {
369  strncpy(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME).c_str(), 255); //The ofc dtd file
370  }
371  else
372  {
373  message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
374  }
375 
376  if ((string)filename_dtd != "" && (string)filename_openspdtd != "")
377  {
378  strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file
379  filenames[0] = filename_openspdtd;
380  filenames[1] = filename_dtd;
381  filenames[2] = filename_ofx;
382  if (libofx_context->currentFileType() == OFX)
383  {
384  ofx_proc_sgml(libofx_context, 3, filenames);
385  }
386  else if (libofx_context->currentFileType() == OFC)
387  {
388  ofc_proc_sgml(libofx_context, 3, filenames);
389  }
390  else
391  {
392  message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
393  }
394  if (remove(tmp_filename) != 0)
395  {
396  message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + string(tmp_filename));
397  }
398  }
399  else
400  {
401  message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting");
402  }
403  }
404  else
405  {
406  message_out(ERROR, "ofx_proc_file():No input file specified");
407  }
408  return 0;
409 }
410 
/* Looks for the next tag (opening or closing) in input_string, scanning from
 * pos_start.
 *
 * On success the text between '<' and '>' is returned, pos_start is moved to
 * the index of the '<' and pos_end to the index just past the matching '>'.
 * If the '<' is never closed, pos_end becomes string::npos and everything
 * after the '<' is returned.
 * If there is no '<' at all, both positions become string::npos and an empty
 * string is returned.
 */
static std::string find_tag_open (std::string& input_string, size_t& pos_start, size_t& pos_end)
{
  const size_t lt_idx = input_string.find ('<', pos_start);
  pos_start = lt_idx;
  if (lt_idx == std::string::npos)
  {
    pos_end = std::string::npos;
    return std::string();
  }

  const size_t name_idx = lt_idx + 1;
  const size_t gt_idx = input_string.find ('>', name_idx);
  if (gt_idx == std::string::npos)
  {
    // Unterminated tag: hand back the remainder of the string.
    pos_end = std::string::npos;
    return input_string.substr(name_idx);
  }

  pos_end = gt_idx + 1;
  return input_string.substr(name_idx, gt_idx - name_idx);
}
432 
433 /* Searches input string for a closing tag matching tag_name starting at pos.
434  * If found pos will be set to the position right after of the closing '>'
435  * If no matching closing tag is found pos will be set to the start of the next
436  * opening or closing tag found.
437  */
438 static void find_tag_close (string& input_string, string& tag_name, size_t& pos)
439 {
440  size_t start_idx = input_string.find ("</" + tag_name + ">", pos);
441 
442  if (start_idx == string::npos)
443  {
444  start_idx = pos;
445  size_t end_idx;
446  string new_tag_name = find_tag_open (input_string, start_idx, end_idx);
447  if (!new_tag_name.empty())
448  {
449  message_out(DEBUG, "find_tag_close() fell back to next open tag: " + new_tag_name);
450  // find_tag_open returns the *end* of an opening tag, but in this
451  // case we want its start, so we need to rewind a bit..
452  pos = start_idx;
453  //printf("find_tag_close() returning pos after fallback: %d\n",pos);
454  }
455  else
456  {
457  pos = input_string.length();
458  }
459  }
460  else
461  {
462  pos = start_idx + tag_name.length() + 3;
463  }
464  return;
465 }
466 
467 
479 string sanitize_proprietary_tags(string input_string)
480 {
481  size_t last_known_good_pos = 0;
482  size_t open_tag_start_pos = last_known_good_pos;
483  size_t open_tag_end_pos;
484  size_t close_tag_end_pos;
485 
486  string tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
487  while (!tag_name.empty())
488  {
489  // Determine whether the current tag is proprietary.
490  if ((tag_name.find('.') != string::npos) || // tag has a . in the name
491  (tag_name == "CATEGORY")) // Chase bank started setting these in 2017
492  {
493  close_tag_end_pos = open_tag_end_pos;
494  find_tag_close (input_string, tag_name, close_tag_end_pos);
495  size_t tag_size = close_tag_end_pos - open_tag_start_pos;
496  string prop_tag = input_string.substr(open_tag_start_pos, tag_size);
497  message_out(INFO, "sanitize_proprietary_tags() removed: " + prop_tag);
498  input_string.erase(open_tag_start_pos, tag_size);
499  last_known_good_pos = open_tag_start_pos;
500  }
501  else
502  {
503  last_known_good_pos = open_tag_end_pos;
504  }
505  tag_name.clear();
506  open_tag_start_pos = last_known_good_pos;
507  if (last_known_good_pos != string::npos)
508  tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
509  }
510  return input_string;
511 }
512 
513 
514 #ifdef __WIN32__
515 static std::string get_dtd_installation_directory()
516 {
517  // Partial implementation of
518  // http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory
519  char ch_fn[MAX_PATH], *p;
520  std::string str_fn;
521 
522  if (!GetModuleFileName(NULL, ch_fn, MAX_PATH)) return "";
523 
524  if ((p = strrchr(ch_fn, '\\')) != NULL)
525  * p = '\0';
526 
527  p = strrchr(ch_fn, '\\');
528  if (p && (_stricmp(p + 1, "bin") == 0 ||
529  _stricmp(p + 1, "lib") == 0))
530  *p = '\0';
531 
532  str_fn = ch_fn;
533  str_fn += "\\share\\libofx\\dtd";
534 
535  return str_fn;
536 }
537 #endif
538 
539 
552 std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename)
553 {
554  string dtd_path_filename;
555  char *env_dtd_path;
556 
557  dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir();
558  if (!dtd_path_filename.empty())
559  {
560  dtd_path_filename.append(dtd_filename);
561  ifstream dtd_file(dtd_path_filename.c_str());
562  if (dtd_file)
563  {
564  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
565  return dtd_path_filename;
566  }
567  }
568 
569 #ifdef __WIN32__
570  dtd_path_filename = get_dtd_installation_directory();
571  if (!dtd_path_filename.empty())
572  {
573  dtd_path_filename.append(DIRSEP);
574  dtd_path_filename.append(dtd_filename);
575  ifstream dtd_file(dtd_path_filename.c_str());
576  if (dtd_file)
577  {
578  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
579  return dtd_path_filename;
580  }
581  }
582 #endif
583  /* Search in environement variable OFX_DTD_PATH */
584  env_dtd_path = getenv("OFX_DTD_PATH");
585  if (env_dtd_path)
586  {
587  dtd_path_filename.append(env_dtd_path);
588  dtd_path_filename.append(DIRSEP);
589  dtd_path_filename.append(dtd_filename);
590  ifstream dtd_file(dtd_path_filename.c_str());
591  if (!dtd_file)
592  {
593  message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename);
594  }
595  else
596  {
597  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
598  return dtd_path_filename;
599  }
600  }
601 
602  for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++)
603  {
604  dtd_path_filename = DTD_SEARCH_PATH[i];
605  dtd_path_filename.append(DIRSEP);
606  dtd_path_filename.append(dtd_filename);
607  ifstream dtd_file(dtd_path_filename.c_str());
608  if (!dtd_file)
609  {
610  message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename);
611  }
612  else
613  {
614  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
615  return dtd_path_filename;
616  }
617  }
618 
619  /* Last resort, look in source tree relative path (useful for development) */
620  dtd_path_filename = "";
621  dtd_path_filename.append("..");
622  dtd_path_filename.append(DIRSEP);
623  dtd_path_filename.append("dtd");
624  dtd_path_filename.append(DIRSEP);
625  dtd_path_filename.append(dtd_filename);
626  ifstream dtd_file(dtd_path_filename.c_str());
627  if (!dtd_file)
628  {
629  message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree.");
630  }
631  else
632  {
633  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
634  return dtd_path_filename;
635  }
636 
637 
638  message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename);
639  return "";
640 }
Definition: messages.hh:32
int ofx_proc_file(LibofxContextPtr ctx, const char *p_filename)
File pre-processing of OFX AND for OFC files.
Definition: ofx_preproc.cpp:81
const int DTD_SEARCH_PATH_NUM
The number of different paths to search for DTDs.
Definition: ofx_preproc.cpp:61
int message_out(OfxMsgType error_type, const string message)
Message output function.
Definition: messages.cpp:61
OFX/SGML parsing functionality.
const char * DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM]
The list of paths to search for the DTDs.
Definition: ofx_preproc.cpp:67
int ofc_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofc_sgml.cpp:353
Various simple functions for type conversion & al.
int ofx_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofx_sgml.cpp:372
string sanitize_proprietary_tags(string input_string)
Removes proprietary tags and comments.
OFX/SGML parsing functionality.
Message IO functionality.
Preprocessing of the OFX files before parsing.
std::string find_dtd(LibofxContextPtr ctx, const std::string &dtd_filename)
Find the appropriate DTD for the file version.