TECA
The Toolkit for Extreme Climate Analysis
teca_table_reader.h
1 #ifndef teca_table_reader_h
2 #define teca_table_reader_h
3 
4 #include "teca_algorithm.h"
5 #include "teca_metadata.h"
6 #include "teca_shared_object.h"
7 #include "teca_table.h"
8 
9 #include <vector>
10 #include <string>
11 #include <mutex>
12 
13 
14 TECA_SHARED_OBJECT_FORWARD_DECL(teca_table_reader)
15 
16 /// a reader for data stored in CSV or binary table format
17 /**
18  * A reader for data stored in CSV or binary table format. By default the reader
19  * reads and returns the entire table on rank 0. The reader can partition the
20  * data across an "index column". The index column assigns a unique id to rows
21  * that should be returned together. The reader reports the number of unique ids
22  * to the pipeline which can then be requested by the pipeline during parallel or
23  * sequential execution.
24  *
25  * output:
26  * generates a table containing the data read from the file.
27  *
28  *
29  * ### TECA CSV format specification
30  *
31  * #### Comment lines
32  *
33  * a '#' character at the start of a line marks it as a comment. The version of
34  * the CSV specification as well as the version of TECA used to write the table
35  * will be stored in comment lines. Comment lines are currently skipped when
36  * reading the table.
37  *
38  * #### Column definitions
39  *
40  * the first row stores the names and data types of the columns. Column names are
41  * strings and delimited by double quotes. A column's data type is
42  * encoded in the name using (N) where N is an integer type code defined by
43  * teca_variant_array and parentheses delimit the type code. The type code
44  * sequence is stripped from the name when the file is read.
45  *
46  * | C type | code |
47  * | ------ | ---- |
48  * | char | 1 |
49  * | unsigned char | 2 |
50  * | int | 3 |
51  * | unsigned int | 4 |
52  * | short int | 5 |
53  * | short unsigned int | 6 |
54  * | long | 7 |
55  * | unsigned long | 8 |
56  * | long long | 9 |
57  * | unsigned long long | 10 |
58  * | float | 11 |
59  * | double | 12 |
60  * | std::string | 13 |
61  *
62  * The number of column definitions found determines the number of columns in the
63  * table when reading.
64  *
65  * #### Column data
66  *
67  * Data is organized row by row with an entry for each column. Entries are
68  * separated by commas ','. Errors will occur when the number of column
69  * definitions doesn't match the number of data entries per row.
70  *
71  * #### String data
72  *
73  * Strings are delimited by double quotations. Double quotes and commas in strings
74  * may be escaped by a backslash.
75  *
76  * #### Numeric data
77  *
78  * The type code provided in the column definition tells the type of number.
79  * These codes are defined in teca_variant_array.
80  *
81  * floating point types are written with format and precision such that they
82  * may be read without introducing rounding error.
83  */
85 {
86 public:
87  TECA_ALGORITHM_STATIC_NEW(teca_table_reader)
88  TECA_ALGORITHM_DELETE_COPY_ASSIGN(teca_table_reader)
89  TECA_ALGORITHM_CLASS_NAME(teca_table_reader)
91 
92  // report/initialize to/from Boost program options
93  // objects.
94  TECA_GET_ALGORITHM_PROPERTIES_DESCRIPTION()
95  TECA_SET_ALGORITHM_PROPERTIES()
96 
97  // the file from which data will be read.
98  TECA_ALGORITHM_PROPERTY(std::string, file_name)
99 
100  // name of the column containing index values.
101  // if this is not empty the reader will operate
102  // in parallel mode serving up requested indices
103  // on demand. otherwise rank 0 reads the entire
104  // table regardless of what is requested.
105  TECA_ALGORITHM_PROPERTY(std::string, index_column)
106 
107  // when set a column named "original_ids" is placed
108  // into the output. values map back to the row number
109  // of the source dataset. By default this is off.
110  TECA_ALGORITHM_PROPERTY(int, generate_original_ids)
111 
112  // names of the columns to copy directly into metadata
113  TECA_ALGORITHM_VECTOR_PROPERTY(std::string, metadata_column_name)
114 
115  // keys that identify metadata columns
116  TECA_ALGORITHM_VECTOR_PROPERTY(std::string, metadata_column_key)
117 
118  // add a metadata column with the given key. the column name and the key are appended in the same call, so entries added through this method occupy the same position in the name list and the key list.
119  void add_metadata_column(const std::string &column, const std::string &key)
120  {
121  this->append_metadata_column_name(column); // record the name of the table column
122  this->append_metadata_column_key(key); // record the key paired with that column
123  }
124 
125  // removes all metadata columns by clearing both the list of column names and the list of keys.
126  void clear_metadata_columns()
127  {
128  this->clear_metadata_column_names(); // drop the column names
129  this->clear_metadata_column_keys(); // drop the matching keys so the two lists stay the same length
130  }
131 
132  // Select the format of the file to be read. 0 : csv, 1 : bin, 2 : xlsx, 3 : auto
133  // the default is csv. NOTE(review): the default is not visible in this header, confirm it against the constructor.
134  enum {format_csv, format_bin, format_xlsx, format_auto}; // unscoped enum, values 0..3 in declaration order
135  TECA_ALGORITHM_PROPERTY(int, file_format)
136  void set_file_format_csv(){ this->set_file_format(format_csv); } // shorthand for set_file_format(format_csv)
137  void set_file_format_bin(){ this->set_file_format(format_bin); } // shorthand for set_file_format(format_bin)
138  void set_file_format_xlsx(){ this->set_file_format(format_xlsx); } // shorthand for set_file_format(format_xlsx)
139  void set_file_format_auto(){ this->set_file_format(format_auto); } // shorthand for set_file_format(format_auto)
140 
141 protected:
143 
144 private:
145  teca_metadata get_output_metadata(unsigned int port, // overrides a base class virtual; returns a teca_metadata object
146  const std::vector<teca_metadata> &input_md) override;
147 
148  const_p_teca_dataset execute(unsigned int port, // overrides a base class virtual; returns a const dataset pointer
149  const std::vector<const_p_teca_dataset> &input_data,
150  const teca_metadata &request) override;
151 
152  void set_modified() override; // overrides a base class virtual
153  void clear_cached_metadata(); // not an override. NOTE(review): presumably discards metadata cached by the reader, confirm in the implementation file.
154 
155 private:
156  std::string file_name; // same name and type as the file_name property declared earlier in the class
157  std::string index_column; // same name and type as the index_column property
158  int generate_original_ids; // same name and type as the generate_original_ids property
159  int file_format; // same name and type as the file_format property; holds one of the format_* enum values
160  std::vector<std::string> metadata_column_names; // NOTE(review): presumably the storage for the metadata_column_name vector property, confirm against the macro definition
161  std::vector<std::string> metadata_column_keys; // NOTE(review): presumably the storage for the metadata_column_key vector property, confirm against the macro definition
162 
163  struct teca_table_reader_internals; // forward declaration only; the definition is not in this header
164  teca_table_reader_internals *internals; // raw pointer to the internals struct. NOTE(review): allocation and cleanup are not visible here, confirm in the implementation file.
165 };
166 
167 #endif
teca_metadata
A generic container for meta data in the form of name=value pairs.
Definition: teca_metadata.h:18
teca_table_reader
a reader for data stored in binary table format
Definition: teca_table_reader.h:84
teca_shared_object.h
teca_algorithm
The interface to TECA pipeline architecture.
Definition: teca_algorithm.h:237