bes  Updated for version 3.20.8
merge_dmrpp.cc
1 #include <iostream>
2 #include <fstream>
3 #include <sstream>
4 #include <string>
5 #include <vector>
6 #include <set>
7 #include <algorithm>
8 
9 using namespace std;
10 
11 // The following block of functions retrieve the "missing" variable type, variable name and data value information.
12 bool obtain_var_info(const string & miss_dmrpp_info,vector<string> & var_types, vector<string>&var_names,vector<string>&chunk_info_list,bool & is_chunk_mark1);
13 bool find_var_name(const string &str,size_t &str_pos,size_t &var_name_pos_start,size_t &var_name_pos_end);
14 bool find_end_var_block(const string&str, const string&var_type, const size_t &str_pos, size_t &var_end_pos);
15 bool find_chunk_info(const string &str,const size_t&str_pos,size_t &chunk_info_pos_start, size_t &chunk_info_pos_end,const size_t&var_end_pos,bool & is_mark1);
16 
17 // The following block of functions add the file address(mostly the absolute path of the HDF5 file that stores the data value) to the chunk block.
18 bool add_faddr_chunk_info(const string& miss_dmrpp_info,vector<string>&chunk_info_list,bool is_dmrpp_mark1,const string faddr_source = "");
19 bool add_faddr_contig_line(string &chunk_info,const string &file_addr);
20 bool add_faddr_chunk_comp_lines(string & chunk_info,const string &file_addr);
21 
22 // The following block of functions merge the "missing" variable data value information to the original dmrpp file.
23 bool add_missing_info_to_file(const string &fname2,const vector<string> &var_types,const vector<string> &var_names,const vector<string> &chunk_info_list);
24 void gen_block(const vector<string>&var_type_list,const vector<string>&var_name_list,vector<string>&block_begin,vector<string>&block_end);
25 bool check_overlap_intervals(const vector<size_t> &sort_block_pos, const vector<size_t> & block_pos_start);
26 void obtain_bindex_in_modified_string(const vector<size_t>& block_pos_start, vector<int>& block_index);
27 bool split_string(const string & str, vector<string> &str_vec,const vector<string> &block_begin, const vector<string> &block_end,vector<int> &block_index);
28 bool convert_dmrppstr_to_vec(const string &dmrpp_str,vector<string> &dmrpp_str_vec,const vector<string> &var_types,const vector<string> &var_names,vector<int> & block_index);
29 void add_missing_info_to_vec(vector<string> &dmrpp_str_vec,const vector<string> &chunk_info_list,const vector<int>&block_index);
30 void write_vec_to_file(const string &fname,const vector<string> &dmrpp_str_vec);
31 
32 // The following two functions are helper functions
33 void file_to_string(const string &filename, string & out);
34 void string_tokenize(const string &in_str,const char delim,vector<string>&out_vec);
35 
36 
37 int main (int argc,char**argv)
38 {
39  string dmrpp_line;
40  vector<string>var_types;
41  vector<string>var_names;
42  vector<string>chunk_info_list;
43 
44  bool add_dmrpp_info = false;
45  bool is_chunk_mark1 = true;
46 
47  string missing_dmrpp_str;
48 
49  if(argc != 5) {
50  cout<<"Please provide four arguments: "<< endl;
51  cout<<" The first is the dmrpp file that contains the missing variable value information. "<<endl;
52  cout<<" The second is the original dmrpp file. "<<endl;
53  cout<<" An third one is the href to the missing variables HDF5 file. "<<endl;
54  cout<<" The fourth one is the text file that includes the missing variable information. "<<endl;
55  return 0;
56  }
57 
58  // Obtain the dmrpp file name that contains the missing variable value.
59  string fname(argv[1]);
60 
61  // Read the "missing dmrpp file" to a string
62  file_to_string(fname,missing_dmrpp_str);
63 
64  // Obtain the missing chunk information from the dmrpp file.
65  add_dmrpp_info = obtain_var_info(missing_dmrpp_str,var_types,var_names,chunk_info_list,is_chunk_mark1);
66 
67  // Just output a warning that there is no chunk info, in the supplemental dmrpp file.
68  if(false == add_dmrpp_info) {
69  cout<<"Cannot find corresponding chunk info. from the supplemental dmrpp file."<<endl;
70  cout<<"You may need to check if there is any variable in the dmrpp file. "<<endl;
71  cout<<"The dmrpp file is "<<fname <<endl;
72  }
73 
74  if(var_types.size() !=var_names.size() || var_names.size() != chunk_info_list.size()) {
75  cout <<"Var type, var name and chunk_info must have the same number of sizes. "<<endl;
76  cout <<"The dmrpp file is "<<fname <<endl;
77  return 0;
78  }
79 
80 #if 0
81 
82  for (size_t i =0; i<var_names.size();i++) {
83 //cout<<"var type["<<i<<"]"<< var_types[i]<<endl;
84 //cout<<"var name["<<i<<"]"<< var_names[i]<<endl;
85 //cout<<"chunk_info_list["<<i<<"]"<< endl;
86 
87  }
88 #endif
89 
90  // We need to erase those variables that are not really missing but are added by the generation program
91  string mvar_fname(argv[4]);
92  string missing_vname_str;
93 
94  // Read the missing variable names to a string and tokenize the string to a vector of string.
95  file_to_string(mvar_fname,missing_vname_str);
96 
97  vector<string> missing_vname_list;
98  char delim=',';
99  string_tokenize(missing_vname_str,delim,missing_vname_list);
100 
101 #if 0
102  for(size_t i = 0;i<missing_vname_list.size();i++)
103  cout <<"missing_vname_list["<<i<<"]"<<missing_vname_list[i]<<endl;
104 #endif
105 
106  // Remove the additional variables added by the filenetCDF-4 module.
107  vector<string>new_var_types;
108  vector<string>new_var_names;
109  vector<string>new_chunk_info_list;
110 
111  for (size_t i =0; i<var_names.size();i++) {
112  for(size_t j = 0; j<missing_vname_list.size();j++) {
113  if(var_names[i] == missing_vname_list[j]) {
114  new_var_names.push_back(var_names[i]);
115  new_var_types.push_back(var_types[i]);
116  new_chunk_info_list.push_back(chunk_info_list[i]);
117  break;
118  }
119  }
120  }
121 
122  // Add file address to each chunk. Mostly the file address is the absolute path of the HDF5 files.
123  string fadd_source(argv[3]);
124  add_faddr_chunk_info(missing_dmrpp_str,new_chunk_info_list,is_chunk_mark1,fadd_source);
125 
126 #if 0
127 for (size_t i =0; i<new_var_types.size();i++) {
128 cout<<"new chunk_info_list["<<i<<"]"<< endl;
129 cout<<new_chunk_info_list[i]<<endl;
130 }
131 #endif
132 
133  //string dmrpp_str;
134  string fname2(argv[2]);
135 
136  // Add the missing chunk info to the original dmrpp file.
137  bool well_formed = add_missing_info_to_file(fname2,new_var_types,new_var_names,new_chunk_info_list);
138 
139  if(false == well_formed) {
140  cout <<"The dmrpp file to be modified is either not well-formed or contains nested variable blocks that cannot be supported by this routine" <<endl;
141  cout <<"The dmrpp file is "<<fname2<<endl;
142 
143  }
144 
145  return 0;
146 
147 }
148 
149 // Obtain the var info from the supplemental(missing) dmrpp file. The variable types we checked are limited to DAP2 data types plus 64-bit integers.
150 bool obtain_var_info(const string & miss_dmrpp_info,vector<string> & var_types, vector<string>&var_names,vector<string>&chunk_info_list,bool & is_chunk_mark1) {
151 
152  bool ret = false;
153  vector<string> var_type_list;
154  var_type_list.push_back("Float32");
155  var_type_list.push_back("Int32");
156  var_type_list.push_back("Float64");
157  var_type_list.push_back("Byte");
158  var_type_list.push_back("Int16");
159  var_type_list.push_back("UInt16");
160  var_type_list.push_back("String");
161  var_type_list.push_back("UInt32");
162  var_type_list.push_back("Int8");
163  var_type_list.push_back("Int64");
164  var_type_list.push_back("UInt64");
165  var_type_list.push_back("UInt8");
166  var_type_list.push_back("Char");
167 
168  size_t var_type_pos_start =0;
169  size_t var_name_pos_start = 0;
170  size_t var_name_pos_end = 0;
171  size_t chunk_pos_start = 0;
172  size_t chunk_pos_end = 0;
173  size_t var_end_pos= 0;
174  size_t str_pos = 0;
175 
176 
177  if(miss_dmrpp_info.empty())
178  return ret;
179 
180  size_t str_last_char_pos = miss_dmrpp_info.size()-1;
181  bool well_formed = true;
182 
183  // Go through the whole missing dmrpp string
184  while (str_pos <=str_last_char_pos && well_formed) {
185 
186  size_t i = 0;
187  string var_sign;
188  string temp_var_sign;
189  size_t temp_var_type_pos_start=string::npos;
190  int var_type_index = -1;
191 
192  // Go through the var_type_list to obtain the var data type
193  // We need to find the index in the var_type_list to
194  // obtain the correct var datatype.
195  while(i <var_type_list.size()) {
196  var_sign = "<"+var_type_list[i]+" name=\"";
197  var_type_pos_start = miss_dmrpp_info.find(var_sign,str_pos);
198  if(var_type_pos_start ==string::npos) {
199  i++;
200  continue;
201  }
202  else {
203  // We want to make sure we don't skip any vars.
204  if(temp_var_type_pos_start>var_type_pos_start){
205  temp_var_type_pos_start = var_type_pos_start;
206  var_type_index = i;
207  temp_var_sign = var_sign;
208  }
209  i++;
210  }
211 
212  }
213 
214  // Ensure all variables are scanned.
215  if(temp_var_type_pos_start !=string::npos) {
216  var_type_pos_start = temp_var_type_pos_start;
217  var_sign = temp_var_sign;
218 
219  }
220 
221  // This line will ignore datatypes that are not in the var_type_list
222  if(var_type_pos_start == string::npos) {
223  str_pos = string::npos;
224  continue;
225  }
226  else
227  str_pos = var_type_pos_start+var_sign.size();
228 
229  // Now we can retrieve var name, var type and the corresponding chunk info
230  // Sanity check is also applied.
231  if(false == find_var_name(miss_dmrpp_info,str_pos,var_name_pos_start,var_name_pos_end))
232  well_formed = false;
233  else if(false == find_end_var_block(miss_dmrpp_info,var_type_list[var_type_index],str_pos,var_end_pos))
234  well_formed = false;
235  else if(false == find_chunk_info(miss_dmrpp_info,str_pos,chunk_pos_start,chunk_pos_end,var_end_pos,is_chunk_mark1))
236  well_formed = false;
237  else {
238  // Move the string search pos to the next block
239  str_pos = var_end_pos+1;
240  // Obtain var type, var name and chunk info. and save them to vectors.
241  var_types.push_back(var_type_list[var_type_index]);
242  var_names.push_back(miss_dmrpp_info.substr(var_name_pos_start,var_name_pos_end-var_name_pos_start));
243  string temp_chunk_info = miss_dmrpp_info.substr(chunk_pos_start,chunk_pos_end-chunk_pos_start);
244  if(true == is_chunk_mark1)
245  temp_chunk_info +="</dmrpp:chunks>";
246  else
247  temp_chunk_info +="/>";
248  chunk_info_list.push_back(temp_chunk_info);
249  }
250 
251  }
252  return well_formed;
253 
254 }
255 
// Locate the variable name that starts at str_pos and is terminated by a
// double quote, such as the temperature in name="temperature".
//
// str_pos            - in: the first character of the name;
//                      out: the position of the closing quote (unchanged on failure).
// var_name_pos_start - receives the incoming str_pos.
// var_name_pos_end   - receives the position of the closing quote, or npos.
//
// Returns false when no closing quote exists at or after str_pos.
bool find_var_name(const string &str,size_t &str_pos,size_t &var_name_pos_start,size_t &var_name_pos_end) {

    var_name_pos_start = str_pos;
    var_name_pos_end = str.find('"',str_pos);

    if(string::npos == var_name_pos_end)
        return false;

    str_pos = var_name_pos_end;
    return true;
}
274 
// Find the closing tag of a variable block, something like </Float32>.
//
// var_type    - the variable type used to build the closing tag.
// str_pos     - where the search starts.
// var_end_pos - receives the position of the first closing tag at or after
//               str_pos, or npos when there is none.
//
// Returns true when the closing tag is found.
bool find_end_var_block(const string&str, const string&var_type, const size_t &str_pos, size_t &var_end_pos) {

    string closing_tag("</");
    closing_tag += var_type;
    closing_tag += '>';

    var_end_pos = str.find(closing_tag,str_pos);
    return string::npos != var_end_pos;

}
287 
// Find the chunk information of the variable block that is being scanned.
// The chunk info must be confined by either <dmrpp:chunks and </dmrpp:chunks>
// or by <dmrpp:chunk and />, and it must lie inside the current variable block.
//
// str                  - the dmrpp string.
// str_pos              - where the search starts (inside the variable block).
// chunk_info_pos_start - receives the start of the chunk info, including the
//                        spaces that directly precede the opening mark.
// chunk_info_pos_end   - receives the position of the closing mark
//                        (</dmrpp:chunks> or />); the mark itself is not included.
// var_end_pos          - the position of the closing tag of the variable block.
// is_mark1             - set to true when the <dmrpp:chunks form is used,
//                        to false otherwise.
//
// Returns false, with both positions set to npos, when no complete chunk info
// exists between str_pos and var_end_pos.
bool find_chunk_info(const string &str,const size_t&str_pos,size_t &chunk_info_pos_start, size_t &chunk_info_pos_end,const size_t&var_end_pos,bool & is_mark1){

    const string chunk_start_mark1 = "<dmrpp:chunks";
    const string chunk_end_mark1 = "</dmrpp:chunks>";
    const string chunk_start_mark2 = "<dmrpp:chunk ";
    const string chunk_end_mark2 = "/>";
    const char wspace=' ';

    size_t start_pos = str.find(chunk_start_mark1,str_pos);
    size_t end_pos = string::npos;

    // A mark found at or beyond var_end_pos belongs to another variable,
    // so it must not be used for this one.
    if(string::npos != start_pos && start_pos < var_end_pos) {
        is_mark1 = true;
        end_pos = str.find(chunk_end_mark1,start_pos);
    }
    else {
        is_mark1 = false;
        start_pos = str.find(chunk_start_mark2,str_pos);
        if(string::npos != start_pos && start_pos < var_end_pos) {
            // Search from the chunk line itself; searching from str_pos could
            // stop at the "/>" of an earlier element such as <Dim .../>.
            end_pos = str.find(chunk_end_mark2,start_pos);
        }
        else
            start_pos = string::npos;
    }

    // The whole chunk info must be present and end before the variable block ends.
    if(string::npos == start_pos || string::npos == end_pos || var_end_pos <= end_pos) {
        chunk_info_pos_start = string::npos;
        chunk_info_pos_end = string::npos;
        return false;
    }

    // Include the spaces ahead of the opening mark so the indentation is kept.
    // If only spaces precede the mark, find_last_not_of() returns npos and
    // npos+1 wraps to 0, the start of the string.
    if(start_pos > 0)
        start_pos = str.find_last_not_of(wspace,start_pos-1)+1;

    chunk_info_pos_start = start_pos;
    chunk_info_pos_end = end_pos;
    return true;
}
333 
334 // We need to add the supplemental file path to the chunk info.
335 // The file name usually starts with "name= ..." and the path usually starts with dmrpp:href="
336 bool add_faddr_chunk_info(const string &str,vector<string>& chunk_info,bool is_dmrpp_mark1, const string faddr_source) {
337 
338  bool well_formed= true;
339  if(chunk_info.size()==0)
340  return true;
341  string addr_mark = "dmrpp:href=\"";
342 
343  // The missing DMRPP file can have file address specified along with chunk info.
344  // But we assume if they do this for one chunk, they should do this for all chunks.
345  // If this is the case, no need to find address.
346  if(chunk_info[0].find(addr_mark)!=string::npos)
347  return true;
348 
349  // retrieve name and reference
350  string hdf5_fname;
351  string hdf5_faddr;
352  string name_mark = " name=\"";
353  string end_delim1 ="\"";
354 
355  // We must find a valid hdf5 file name.
356  size_t hdf5_fname_start_pos = str.find(name_mark);
357  if(hdf5_fname_start_pos == string::npos)
358  well_formed = false;
359  size_t hdf5_fname_end_pos = str.find(end_delim1,hdf5_fname_start_pos+name_mark.size());
360  if(hdf5_fname_end_pos == string::npos)
361  well_formed = false;
362  hdf5_fname = str.substr(hdf5_fname_start_pos+name_mark.size(),hdf5_fname_end_pos-hdf5_fname_start_pos-name_mark.size());
363  if(hdf5_fname=="")
364  well_formed = false;
365 
366  // We also must find a valid file location .
367  size_t hdf5_faddr_start_pos = str.find(addr_mark);
368  if(hdf5_faddr_start_pos != string::npos) {
369  size_t hdf5_faddr_end_pos = str.find(end_delim1,hdf5_faddr_start_pos+addr_mark.size());
370  if(hdf5_faddr_end_pos == string::npos)
371  well_formed = false;
372  hdf5_faddr = str.substr(hdf5_faddr_start_pos+addr_mark.size(),hdf5_faddr_end_pos-hdf5_faddr_start_pos-addr_mark.size());
373  }
374 
375  // The string for use in each missing_variable <chunk href:"value" >
376  hdf5_faddr = " href=\"" + faddr_source + end_delim1;
377 
378  /*if (hdf5_faddr.rfind(hdf5_fname) == string::npos) {
379  //trim hdf5 file address.
380  hdf5_faddr = " href=\"" +hdf5_faddr+'/'+hdf5_fname+end_delim1;
381  }
382  else {
383  hdf5_faddr = " href=\"" +hdf5_faddr+end_delim1;
384  }*/
385 
386 //cout<<"hdf5_faddr is "<<hdf5_faddr <<endl;
387 
388  for (size_t i = 0;i<chunk_info.size();i++) {
389 
390  //If is_dmrpp_mark1 is true,
391  //add hdf5_faddr to each chunk line(The chunk line should have offset==)
392  //However, the variable may also use the contiguous storage.
393  //That chunk line marks with (nbyte==). Essentially it is not a chunk but
394  //the dmrpp still starts with the dmrpp:chunk.
395  if(true == is_dmrpp_mark1)
396  add_faddr_chunk_comp_lines(chunk_info[i],hdf5_faddr);
397  else
398  add_faddr_contig_line(chunk_info[i],hdf5_faddr);
399 
400  }
401  return well_formed;
402 
403 }
404 
// Add the file address to every chunk line of a <dmrpp:chunks> block.
//
// chunk_info - the text of one chunk block; each line that starts with
//              <dmrpp:chunk offset= gets file_addr inserted right before its
//              closing "/>". The block must end with </dmrpp:chunks>.
// file_addr  - the text to insert, for example  href="...".
//
// Returns true when at least one chunk line was found, every chunk line is
// closed by "/>" and the closing </dmrpp:chunks> follows the last chunk line.
// Otherwise false is returned and chunk_info is left untouched, so a caller
// never ends up with a partially rewritten block.
bool add_faddr_chunk_comp_lines(string & chunk_info,const string &file_addr) {

    const string chunk_line_mark = "<dmrpp:chunk offset=";
    const string chunk_line_end_mark = "/>";
    const string chunk_stop_mark = "</dmrpp:chunks>";

    string patched;
    size_t copied_pos = 0;          // everything before this position is already in 'patched'
    bool find_chunk_line = false;

    // Loop over the chunk lines, from <dmrpp:chunks until </dmrpp:chunks>.
    while(true) {
        const size_t line_pos = chunk_info.find(chunk_line_mark,copied_pos);
        if(string::npos == line_pos)
            break;

        // Each chunk offset line must end with "/>".
        const size_t line_end_pos = chunk_info.find(chunk_line_end_mark,line_pos);
        if(string::npos == line_end_pos)
            return false;

        find_chunk_line = true;
        patched.append(chunk_info,copied_pos,line_end_pos-copied_pos);
        patched += file_addr;
        copied_pos = line_end_pos;
    }

    if(false == find_chunk_line)
        return false;

    // The last line must be </dmrpp:chunks>.
    if(string::npos == chunk_info.find(chunk_stop_mark,copied_pos))
        return false;

    // Add the last part of the chunk info. Note: a space is put between the
    // address and the "/>" of the last chunk line.
    patched += ' ';
    patched.append(chunk_info,copied_pos,string::npos);

    chunk_info.swap(patched);
    return true;

}
453 
// Add the file address to a block that consists of a single chunk line
// (used for the contiguous storage).
//
// chunk_info - the chunk line; file_addr followed by one space is inserted
//              right before the first "/>".
// file_addr  - the text to insert, for example  href="...".
//
// Returns false, leaving chunk_info unchanged, when no "/>" is present.
bool add_faddr_contig_line(string &chunk_info,const string &file_addr) {

    const string chunk_line_end_mark = "/>";

    // Just find the end of the line and change it; this should always be the first line.
    const size_t chunk_line_end_pos = chunk_info.find(chunk_line_end_mark);
    if(string::npos == chunk_line_end_pos)
        return false;

    chunk_info.insert(chunk_line_end_pos,file_addr+' ');
    return true;
}
475 
476 // Add the missing info to the original dmrpp file.
477 bool add_missing_info_to_file(const string &fname,const vector<string> &var_types,const vector<string> &var_names,const vector<string> &chunk_info_list) {
478 
479  bool well_formed = true;
480  string dmrpp_str;
481 
482  // The original dmrpp file to string
483  file_to_string(fname,dmrpp_str);
484  vector<string>dmrpp_str_vec;
485  vector <int> block_index;
486 
487  // Convert the original DMRPP string to vector according to var_types and var_names.
488  // We need to remember the block index of the missing variables
489  // since the missing variable order in the supplemental dmrpp
490  // may be different than the original one..
491  well_formed = convert_dmrppstr_to_vec(dmrpp_str,dmrpp_str_vec,var_types,var_names,block_index);
492 
493  // Release the memory of dmpstr. For a >10MB dmrpp file, this is not a small value.
494  string().swap(dmrpp_str);
495 
496  // adding the missing chunk info to the dmrpp vector and then write back to the file.
497  if(true == well_formed) {
498  add_missing_info_to_vec(dmrpp_str_vec,chunk_info_list,block_index);
499  write_vec_to_file(fname,dmrpp_str_vec);
500  }
501  return well_formed;
502 
503 }
504 
505 // Convert the original dmrpp to vectors according to the *missing* variables.
506 // Here we should NOT tokenize the orginal dmrpp according to every variable in it.
507 // We only care about feeding those variables that miss the value information.
508 bool convert_dmrppstr_to_vec(const string &dmrpp_str,vector<string> &dmrpp_str_vec,const vector<string> &var_types,const vector<string> &var_names,vector<int>&block_index){
509 
510  vector<string>block_begin;
511  block_begin.resize(var_types.size());
512  vector<string>block_end;
513  block_end.resize(var_types.size());
514  gen_block(var_types,var_names,block_begin,block_end);
515 
516 #if 0
517 for(size_t i =0; i<block_begin.size();i++)
518 {
519 cout<<"block_begin["<<i<<"]= "<<block_begin[i]<<endl;
520 cout<<"block_end["<<i<<"]= "<<block_end[i]<<endl;
521 
522 }
523 #endif
524 
525  bool well_formed = split_string(dmrpp_str,dmrpp_str_vec,block_begin,block_end,block_index);
526  return well_formed;
527 
528 }
529 
// Insert the chunk information into the variable blocks of the split dmrpp string.
//
// dmrpp_str_vec   - the pieces of the original dmrpp. The pieces at the odd
//                   positions (1, 3, 5, ...) are the variable blocks; the
//                   pieces around them are the text in between.
// chunk_info_list - the chunk information of the missing variables.
// block_index     - block_index[i] is the entry of chunk_info_list that
//                   belongs to the i-th variable block.
//
// An example of the layout:
//   The original string: Moses gre up i Egypt.
//   The missing information is w in 'gre' and n in 'i', so there are 2 blocks
//   to patch. The string is divided into 5 pieces:
//   "Moses ","gre"," up ","i"," Egypt." and the pieces 1 and 3 are patched to
//   obtain "Moses grew up in Egypt."
void add_missing_info_to_vec(vector<string> &dmrpp_str_vec,const vector<string> &chunk_info_list,const vector<int> &block_index) {

    const char tag_close = '>';

    for (size_t blk = 0; blk < block_index.size(); ++blk) {

        string &var_block = dmrpp_str_vec[2*blk+1];

        // The last '>' closes the end tag of the variable block. The one
        // before it closes the last element inside the block; the chunk info
        // goes right after that one, on a new line.
        const size_t end_tag_close = var_block.find_last_of(tag_close);
        const size_t prev_close = var_block.find_last_of(tag_close,end_tag_close-1);

        // The block_index[blk] will ensure the right chunk info.
        const string addition = '\n'+chunk_info_list[block_index[blk]];
        var_block.insert(prev_close+1,addition);
    }

}
561 
// The final step: write all pieces, one after the other and without any
// separator, to the file fname. The previous content of the file is replaced.
// Since the dmrpp is relatively small, rewriting the file is still the fast way.
void write_vec_to_file(const string &fname,const vector<string> &dmrpp_str_vec) {

    ofstream out_stream(fname.c_str());

    for (vector<string>::const_iterator piece = dmrpp_str_vec.begin(); piece != dmrpp_str_vec.end(); ++piece)
        out_stream << *piece;

    out_stream.close();

}
576 
// Build the opening and the closing tag of each variable block, for example
// <Float32 name="lat"> and </Float32>.
//
// var_type_list - the variable types.
// var_name_list - the variable names (parallel to var_type_list).
// block_begin   - receives the opening tags; the caller must have sized it to
//                 hold at least var_type_list.size() entries.
// block_end     - receives the closing tags; same size requirement.
void gen_block(const vector<string>&var_type_list,const vector<string>&var_name_list,vector<string>&block_begin,vector<string>&block_end) {

    const size_t num_vars = var_type_list.size();

    for (size_t idx = 0; idx < num_vars; ++idx) {
        const string &type_name = var_type_list[idx];

        string open_tag(1,'<');
        open_tag += type_name;
        open_tag += ' ';
        open_tag += "name=\"";
        open_tag += var_name_list[idx];
        open_tag += "\">";

        string close_tag("</");
        close_tag += type_name;
        close_tag += '>';

        block_begin[idx] = open_tag;
        block_end[idx] = close_tag;
    }
}
585 
586 // Split the string into different blocks.
587 bool split_string(const string & str, vector<string> &str_vec,const vector<string> &block_begin, const vector<string>&block_end,vector<int>&block_index) {
588 
589  bool well_formed = true;
590  vector<size_t> block_begin_pos;
591  vector<size_t> block_end_pos;
592  block_begin_pos.resize(block_begin.size());
593  block_end_pos.resize(block_end.size());
594 
595  // Note:
596  // 1) We just want to split the string according to the variables that miss values.
597  // 2) block_begin_pos in the orginal dmrpp file may NOT be sorted.
598  // However, when we read back the string vector, we want to read from beginnng to the end.
599  // So we need to remember the index of each <var block> of the supplemental dmrpp file
600  // in the original dmrpp file so that the correct chunk info can be given to the var block that misses the values.
601  for(size_t i = 0; i<block_begin.size();i++) {
602  block_begin_pos[i] = str.find(block_begin[i]);
603  block_end_pos[i] = str.find(block_end[i],block_begin_pos[i])+(block_end[i].size());
604  }
605 
606  obtain_bindex_in_modified_string(block_begin_pos,block_index);
607 
608 #if 0
609 for(size_t i = 0; i<block_index.size();i++)
610 cout<<"block_index["<<i<<"] is: "<<block_index[i] <<endl;
611 #endif
612  vector<size_t>block_pos;
613  block_pos.resize(2*block_begin_pos.size());
614  for (size_t i = 0; i<block_begin.size();i++) {
615  block_pos[2*i] = block_begin_pos[i];
616  block_pos[2*i+1] = block_end_pos[i];
617  }
618 
619  // This will ensure the string vector is kept from beginning to the end.
620  sort(block_pos.begin(),block_pos.end());
621 
622  // Use a set: resume a different set, compare with the previous one. set_difference
623  // This will ensure that each <var block> doesn't overlap with others.
624  // It is a sanity check.
625  well_formed = check_overlap_intervals(block_pos,block_begin_pos);
626 
627  // We need to consider the starting and the ending of the string
628  // So the string vector size is block_size + 1.
629  // Examples:
630  // string: Moses grew up in Egypt. It has four space intervals but five substrings.
631  if(true == well_formed) {
632  size_t str_block_pos = 0;
633  str_vec.resize(block_pos.size()+1);
634  for (size_t i =0; i<block_pos.size(); i++) {
635  str_vec[i] = str.substr(str_block_pos,block_pos[i]-str_block_pos);
636  str_block_pos = block_pos[i];
637  }
638  str_vec[block_pos.size()] = str.substr(str_block_pos);
639 
640 #if 0
641 for(size_t i = 0; i <str_vec.size();i++)
642  cout<<"str_vec["<<i<<"] is: "<<str_vec[i] <<endl;
643 #endif
644  }
645  return well_formed;
646 
647 }
648 
// Check that no two var blocks overlap.
//
// sort_block_pos  - all block boundaries (starts and ends), sorted; it must
//                   hold at least two entries per block start.
// block_pos_start - the start positions of the blocks, in any order.
//
// When the blocks do not overlap, the sorted boundaries alternate between a
// start and an end, so the values at the even positions are exactly the block
// starts. The two sets of values are compared; this takes O(nlogn) rather than
// O(n*n) time.
//
// Returns true when there is no overlap.
bool check_overlap_intervals(const vector<size_t> &sort_block_pos, const vector<size_t>&block_pos_start){

    const set<size_t> given_starts(block_pos_start.begin(),block_pos_start.end());

    set<size_t> even_slot_values;
    const size_t num_boundaries = 2*block_pos_start.size();
    for (size_t slot = 0; slot < num_boundaries; slot += 2)
        even_slot_values.insert(sort_block_pos[slot]);

    return even_slot_values == given_starts;

}
664 
// Obtain, for the var blocks taken in the order of their start positions, the
// index each block has in the supplemental dmrpp file.
//
// block_pos_start - block_pos_start[i] is the start position, in the original
//                   dmrpp, of the i-th variable of the supplemental dmrpp.
// block_index     - the indices, ordered by ascending start position, are
//                   appended to this vector.
//
// An example:
// ex.h5.dmrpp has the variables in the order: ex1,ex2,lon,ex3,fakedim,lat.
// It misses the values of lon,fakedim,lat.
// In the supplemental dmrpp that has the value information, the variable order
// is lat,lon,fakedim, so the index of lat is 0, lon is 1 and fakedim is 2.
// The blocks appear in ex.h5.dmrpp in the order lon,fakedim,lat, so the result
// is 1,2,0. While adding the value info to ex.h5.dmrpp, the index identifies
// which chunk info to fill in, without explicitly searching the string again.
//
void obtain_bindex_in_modified_string(const vector<size_t>& block_pos_start, vector<int>& block_index) {

    // Pair every start position with the index it came from.
    vector<pair<size_t,int> > by_position(block_pos_start.size());
    for (size_t i = 0; i < by_position.size(); ++i) {
        by_position[i].first = block_pos_start[i];
        by_position[i].second = static_cast<int>(i);
    }

    // Pairs are ordered by the first element, the start position.
    sort(by_position.begin(),by_position.end());

    for (vector<pair<size_t,int> >::const_iterator it = by_position.begin(); it != by_position.end(); ++it)
        block_index.push_back(it->second);
}
691 
// Helper function: read the whole content of a file into a string.
// out_str is replaced by the content; it becomes empty when nothing can be
// read from the file.
void file_to_string(const string &filename, string &out_str) {

    ifstream in_stream(filename.c_str());

    // Copy everything the file buffer delivers into a string buffer.
    ostringstream content;
    content << in_stream.rdbuf();

    out_str = content.str();
    in_stream.close();

}
706 
// Tokenize the string into a vector of strings according to the delim.
// The tokens are appended to out_vec. Two adjacent delimiters give an empty
// token; a delimiter at the very end of the string does not give a trailing
// empty token, and an empty string gives no token at all.
void string_tokenize(const string &in_str,const char delim,vector<string>&out_vec) {

    size_t token_start = 0;

    while (token_start < in_str.size()) {
        const size_t delim_pos = in_str.find(delim,token_start);

        // No more delimiter: the rest of the string is the last token.
        if(string::npos == delim_pos) {
            out_vec.push_back(in_str.substr(token_start));
            break;
        }

        out_vec.push_back(in_str.substr(token_start,delim_pos-token_start));
        token_start = delim_pos+1;
    }
}
715