TECA
The Toolkit for Extreme Climate Analysis
teca_table_reader.h
1 #ifndef teca_table_reader_h
2 #define teca_table_reader_h
3 
4 #include "teca_config.h"
5 #include "teca_algorithm.h"
6 #include "teca_metadata.h"
7 #include "teca_shared_object.h"
8 #include "teca_table.h"
9 
10 #include <vector>
11 #include <string>
12 #include <mutex>
13 
14 
15 TECA_SHARED_OBJECT_FORWARD_DECL(teca_table_reader)
16 
17 /// a reader for data stored in CSV or binary table format
18 /**
19  * A reader for data stored in CSV or binary table format. By default the reader
20  * reads and returns the entire table on rank 0. The reader can partition the
21  * data across an "index column". The index column assigns a unique id to rows
22  * that should be returned together. The reader reports the number of unique ids
23  * to the pipeline which can then be requested by the pipeline during parallel or
24  * sequential execution.
25  *
26  * output:
27  * generates a table containing the data read from the file.
28  *
29  *
30  * ### TECA CSV format specification
31  *
32  * #### Comment lines
33  *
34  * A '#' character at the start of a line marks it as a comment. The version of
35  * the CSV specification as well as the version of TECA used to write the table
36  * will be stored in comment lines. Comment lines are currently skipped when
37  * reading the table.
38  *
39  * #### Column definitions
40  *
41  * The first row stores the names and data types of the columns. Column names are
42  * strings delimited by double quotes. A column's data type is
43  * encoded in the name using (N) where N is an integer type code defined by
44  * teca_variant_array and parentheses delimit the type code. The type code
45  * sequence is stripped from the name when the file is read.
46  *
47  * | C type | code |
48  * | ------ | ---- |
49  * | char | 1 |
50  * | unsigned char | 2 |
51  * | int | 3 |
52  * | unsigned int | 4 |
53  * | short int | 5 |
54  * | short unsigned int | 6 |
55  * | long | 7 |
56  * | unsigned long | 8 |
57  * | long long | 9 |
58  * | unsigned long long | 10 |
59  * | float | 11 |
60  * | double | 12 |
61  * | std::string | 13 |
62  *
63  * The number of column definitions found determines the number of columns in the
64  * table when reading.
65  *
66  * #### Column data
67  *
68  * Data is organized row by row with an entry for each column. Entries are
69  * separated by commas ','. Errors will occur when the number of column
70  * definitions doesn't match the number of data entries per row.
71  *
72  * #### String data
73  *
74  * Strings are delimited by double quotes. Double quotes and commas in strings
75  * may be escaped by a backslash.
76  *
77  * #### Numeric data
78  *
79  * The type code provided in the column definition tells the type of number.
80  * These codes are defined in teca_variant_array.
81  *
82  * Floating point types are written with format and precision such that they
83  * may be read without introducing rounding error.
84  */
86 {
87 public:
88  TECA_ALGORITHM_STATIC_NEW(teca_table_reader)
89  TECA_ALGORITHM_DELETE_COPY_ASSIGN(teca_table_reader)
90  TECA_ALGORITHM_CLASS_NAME(teca_table_reader)
92 
93  // report/initialize to/from Boost program options
94  // objects.
95  TECA_GET_ALGORITHM_PROPERTIES_DESCRIPTION()
96  TECA_SET_ALGORITHM_PROPERTIES()
97 
98  // the file from which data will be read.
99  TECA_ALGORITHM_PROPERTY(std::string, file_name)
100 
101  // name of the column containing index values.
102  // if this is not empty the reader will operate
103  // in parallel mode serving up requested indices
104  // on demand. otherwise rank 0 reads the entire
105  // table regardless of what is requested.
106  TECA_ALGORITHM_PROPERTY(std::string, index_column)
107 
108  // when set a column named "original_ids" is placed
109  // into the output. values map back to the row number
110  // of the source dataset. By default this is off.
111  TECA_ALGORITHM_PROPERTY(int, generate_original_ids)
112 
113  // names of columns to copy directly into metadata
114  TECA_ALGORITHM_VECTOR_PROPERTY(std::string, metadata_column_name)
115 
116  // keys that identify metadata columns
117  TECA_ALGORITHM_VECTOR_PROPERTY(std::string, metadata_column_key)
118 
119  // add a metadata column with the given key. column is passed to append_metadata_column_name and key to append_metadata_column_key, so the two vector properties are extended together.
120  void add_metadata_column(const std::string &column, const std::string &key)
121  {
122  this->append_metadata_column_name(column);
123  this->append_metadata_column_key(key);
124  }
125 
126  // removes all metadata columns by calling clear_metadata_column_names and clear_metadata_column_keys, so both vector properties are cleared together.
127  void clear_metadata_columns()
128  {
129  this->clear_metadata_column_names();
130  this->clear_metadata_column_keys();
131  }
132 
133  // Select the input file format. 0 : csv, 1 : bin, 2 : xlsx, 3 : auto
134  // the default is csv.
135  enum {format_csv, format_bin, format_xlsx, format_auto};
136  TECA_ALGORITHM_PROPERTY(int, file_format)
137  void set_file_format_csv(){ this->set_file_format(format_csv); }
138  void set_file_format_bin(){ this->set_file_format(format_bin); }
139  void set_file_format_xlsx(){ this->set_file_format(format_xlsx); }
140  void set_file_format_auto(){ this->set_file_format(format_auto); }
141 
142 protected:
144 
145 private:
147 
148  teca_metadata get_output_metadata(unsigned int port,
149  const std::vector<teca_metadata> &input_md) override;
150 
151  const_p_teca_dataset execute(unsigned int port,
152  const std::vector<const_p_teca_dataset> &input_data,
153  const teca_metadata &request) override;
154 
155  void set_modified() override;
156  void clear_cached_metadata();
157 
158 private:
159  std::string file_name;
160  std::string index_column;
161  int generate_original_ids;
162  int file_format;
163  std::vector<std::string> metadata_column_names;
164  std::vector<std::string> metadata_column_keys;
165 
166  struct teca_table_reader_internals;
167  teca_table_reader_internals *internals;
168 };
169 
170 #endif
The interface to TECA pipeline architecture.
Definition: teca_algorithm.h:244
virtual teca_metadata get_output_metadata(unsigned int port, const std::vector< teca_metadata > &input_md)
A generic container for meta data in the form of name=value pairs.
Definition: teca_metadata.h:22
a reader for data stored in binary table format
Definition: teca_table_reader.h:86
p_teca_error_handler error_handler TECA_EXPORT
The global error handler instance.