C++ API Reference for Intel® Data Analytics Acceleration Library 2020 Update 1

csv_feature_manager.h
1 /* file: csv_feature_manager.h */
2 /*******************************************************************************
3 * Copyright 2014-2020 Intel Corporation
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *******************************************************************************/
17 
18 /*
19 //++
20 // Implementation of the CSV feature manager class.
21 //--
22 */
23 
24 #ifndef __CSV_FEATURE_MANAGER_H__
25 #define __CSV_FEATURE_MANAGER_H__
26 
27 #include "data_management/data/numeric_table.h"
28 #include "data_management/features/shortcuts.h"
29 #include "data_management/data_source/data_source.h"
30 #include "data_management/data_source/internal/csv_feature_utils.h"
31 #include "data_management/data_source/modifiers/csv/shortcuts.h"
32 #include "data_management/data_source/modifiers/csv/internal/engine.h"
33 
34 namespace daal
35 {
36 namespace data_management
37 {
38 
43 struct FeatureAuxData
44 {
45  FeatureAuxData() :
46  idx(0), wide(1),
47  dsFeat(0), ntFeat(0), nCats(0) { }
48 
49  explicit FeatureAuxData(size_t index,
50  DataSourceFeature *dataSourceFeature,
51  NumericTableFeature *numericTableFeature) :
52  idx(index),
53  dsFeat(dataSourceFeature),
54  ntFeat(numericTableFeature),
55  wide(1), nCats(0), buffer() { }
56 
57  size_t idx;
58  size_t wide;
59  size_t nCats;
60  DataSourceFeature *dsFeat;
61  NumericTableFeature *ntFeat;
62  std::string buffer;
63 };
64 
65 typedef void (*functionT)(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
66 
71 class ModifierIface
72 {
73 public:
74  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const = 0;
75 
76  virtual ~ModifierIface() {}
77 
78  static void contFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
79  {
80  DAAL_DATA_TYPE f;
81  readNumeric<>( word, f );
82  arr[ aux.idx ] = f;
83  }
84 
85  static void catFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
86  {
87  aux.buffer.assign(word);
88 
89  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
90  CategoricalFeatureDictionary::iterator it = catDict->find( aux.buffer );
91 
92  if( it != catDict->end() )
93  {
94  arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
95  it->second.second++;
96  }
97  else
98  {
99  int index = (int)(catDict->size());
100  catDict->insert( std::pair<std::string, std::pair<int, int> >( aux.buffer, std::pair<int, int>(index, 1) ) );
101  arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
102  aux.ntFeat->categoryNumber = index + 1;
103  }
104  }
105 
106  static void nullFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) { }
107 
108 protected:
109  template<class T>
110  static void readNumeric(const char *text, T &f)
111  {
112  f = daal::services::daal_string_to_float(text, 0);
113  }
114 
115  static void binFunc(const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
116  {
117  aux.buffer.assign(word);
118 
119  CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
120  CategoricalFeatureDictionary::iterator it = catDict->find( aux.buffer );
121 
122  size_t index = 0;
123 
124  if( it != catDict->end() )
125  {
126  index = it->second.first;
127  it->second.second++;
128  }
129  else
130  {
131  index = catDict->size();
132  catDict->insert( std::pair<std::string, std::pair<int, int> >( aux.buffer, std::pair<int, int>((int)index, 1) ) );
133  aux.ntFeat->categoryNumber = index + 1;
134  }
135 
136  size_t nCats = aux.nCats;
137 
138  for(size_t i=0; i<nCats; i++)
139  {
140  arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
141  }
142  }
143 };
144 
149 class MakeCategorical : public ModifierIface
150 {
151  size_t idx;
152 public:
153  MakeCategorical(size_t idx) : idx(idx) {}
154 
155  virtual ~MakeCategorical() {}
156 
157  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
158  {
159  size_t nCols = funcList.size();
160 
161  if(idx < nCols)
162  {
163  funcList[idx] = catFunc;
164  auxVect[idx].buffer.resize(1024);
165  }
166  }
167 };
168 
173 class OneHotEncoder : public ModifierIface
174 {
175  size_t idx;
176  size_t nCats;
177 public:
178  OneHotEncoder(size_t idx, size_t nCats) : idx(idx), nCats(nCats) {}
179 
180  virtual ~OneHotEncoder() {}
181 
182  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
183  {
184  size_t nCols = funcList.size();
185 
186  if(idx < nCols)
187  {
188  funcList[idx] = binFunc;
189  auxVect[idx].buffer.resize(1024);
190  auxVect[idx].nCats = nCats;
191  auxVect[idx].wide = nCats;
192  }
193 
194  size_t nNTCols = 0;
195  for(size_t i=0; i<nCols; i++)
196  {
197  auxVect[i].idx = nNTCols;
198  nNTCols += auxVect[i].wide;
199  }
200  }
201 };
202 
207 class ColumnFilter : public ModifierIface
208 {
209  bool oddFlag;
210  bool evenFlag;
211  bool noneFlag;
212  bool listFlag;
213  services::Collection<size_t> validList;
214 public:
215  ColumnFilter() : oddFlag(false), evenFlag(false), noneFlag(false), listFlag(false) {}
216 
217  virtual ~ColumnFilter() {}
218 
219  ColumnFilter& odd() { oddFlag=true; return *this;}
220  ColumnFilter& even() {evenFlag=true; return *this;}
221  ColumnFilter& none() {noneFlag=true; return *this;}
222  ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=true; return *this;}
223 
224  virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect) const
225  {
226  size_t nCols = funcList.size();
227 
228  if( oddFlag )
229  {
230  for(size_t i=0; i<nCols; i+=2)
231  {
232  funcList[i] = nullFunc;
233  auxVect[i].wide = 0;
234  }
235  }
236 
237  if( evenFlag )
238  {
239  for(size_t i=1; i<nCols; i+=2)
240  {
241  funcList[i] = nullFunc;
242  auxVect[i].wide = 0;
243  }
244  }
245 
246  if( noneFlag )
247  {
248  for(size_t i=0; i<nCols; i++)
249  {
250  funcList[i] = nullFunc;
251  auxVect[i].wide = 0;
252  }
253  }
254 
255  if( listFlag )
256  {
257  services::Collection<bool> flags(nCols);
258 
259  for(size_t i=0; i<nCols; i++)
260  {
261  flags[i] = false;
262  }
263 
264  for(size_t i=0; i<validList.size(); i++)
265  {
266  size_t el = validList[i];
267  if(el<nCols)
268  {
269  flags[el] = true;
270  }
271  }
272 
273  for(size_t i=0; i<nCols; i++)
274  {
275  if(flags[i]) continue;
276  funcList[i] = nullFunc;
277  auxVect[i].wide = 0;
278  }
279  }
280 
281  size_t nNTCols = 0;
282  for(size_t i=0; i<nCols; i++)
283  {
284  auxVect[i].idx = nNTCols;
285  nNTCols += auxVect[i].wide;
286  }
287  }
288 };
289 
290 namespace interface1
291 {
292 
303 class CSVFeatureManager : public StringRowFeatureManagerIface
304 {
305 public:
309  CSVFeatureManager() :
310  _delimiter(','),
311  _numberOfTokens(0),
312  _isHeaderParsed(false) { }
313 
314  virtual ~CSVFeatureManager() { }
315 
319  void setDelimiter( char delimiter )
320  {
321  _delimiter = delimiter;
322  }
323 
324 public:
329  size_t getNumericTableNumberOfColumns() const
330  {
331  if (_modifiersManager)
332  {
333  return _modifiersManager->getNumberOfOutputFeatures();
334  }
335 
336  const size_t nDSCols = auxVect.size();
337  return auxVect[nDSCols - 1].idx + auxVect[nDSCols - 1].wide;
338  }
339 
344  services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
345  {
346  DAAL_CHECK( dictionary, services::ErrorNullPtr );
347 
348  auxVect.clear();
349  funcList.clear();
350  fillAuxVectAndFuncList(*dictionary);
351  _numberOfTokens = dictionary->getNumberOfFeatures();
352 
353  return services::Status();
354  }
355 
360  void addModifier(const ModifierIface &modifier)
361  {
362  modifier.apply(funcList, auxVect);
363  }
364 
372  CSVFeatureManager &addModifier(const features::FeatureIdCollectionIfacePtr &featureIds,
373  const modifiers::csv::FeatureModifierIfacePtr &modifier,
374  services::Status *status = NULL)
375  {
376  services::Status localStatus;
377  if (!_modifiersManager)
378  {
379  _modifiersManager = modifiers::csv::internal::ModifiersManager::create(&localStatus);
380  if (!localStatus)
381  {
382  services::internal::tryAssignStatusAndThrow(status, localStatus);
383  return *this;
384  }
385  }
386 
387  localStatus |= _modifiersManager->addModifier(featureIds, modifier);
388  if (!localStatus)
389  {
390  services::internal::tryAssignStatusAndThrow(status, localStatus);
391  return *this;
392  }
393 
394  return *this;
395  }
396 
402  void parseRowAsHeader(char *rawRowData, size_t rawDataSize)
403  {
404  DAAL_ASSERT( rawRowData );
405 
406  internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
407  for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
408  {
409  _featuresInfo.addFeatureName(tokenizer.getCurrentToken());
410  }
411  }
412 
419  virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize,
420  DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
421  {
422  DAAL_ASSERT( rawRowData );
423  DAAL_ASSERT( dictionary );
424 
425  _numberOfTokens = 0;
426 
427  internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
428  for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
429  {
430  _numberOfTokens++;
431  _featuresInfo.addFeatureType(tokenizer.getCurrentToken());
432  }
433 
434  if (_modifiersManager)
435  {
436  _modifiersManager->prepare(_featuresInfo);
437  _modifiersManager->fillDictionary(*dictionary);
438  }
439  else
440  {
441  fillDictionaryWithoutModifiers(*dictionary);
442  }
443  }
444 
453  virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary,
454  NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
455  {
456  DAAL_ASSERT( nt );
457  DAAL_ASSERT( dictionary );
458  DAAL_ASSERT( rawRowData );
459 
460  nt->getBlockOfRows(ntRowIndex, 1, writeOnly, _currentRowBlock);
461  services::BufferView<DAAL_DATA_TYPE> rowBuffer(_currentRowBlock.getBlockPtr(),
462  _currentRowBlock.getNumberOfColumns());
463 
464  size_t i = 0;
465  internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
466 
467  if (_modifiersManager)
468  {
469  for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
470  {
471  _modifiersManager->setToken(i, tokenizer.getCurrentToken());
472  }
473  _modifiersManager->applyModifiers(rowBuffer);
474  }
475  else
476  {
477  DAAL_DATA_TYPE *row = rowBuffer.data();
478  for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
479  {
480  const services::StringView token = tokenizer.getCurrentToken();
481  funcList[i](token.c_str(), auxVect[i], row);
482  }
483  }
484 
485  nt->releaseBlockOfRows(_currentRowBlock);
486  }
487 
492  void finalize(DataSourceDictionary *dictionary)
493  {
494  if (_modifiersManager)
495  {
496  _modifiersManager->finalize();
497  _modifiersManager->fillDictionary(*dictionary);
498  }
499  }
500 
501 private:
502  void fillDictionaryWithoutModifiers(DataSourceDictionary &dictionary)
503  {
504  const size_t nFeatures = _featuresInfo.getNumberOfFeatures();
505  dictionary.setNumberOfFeatures(nFeatures);
506 
507  for (size_t i = 0; i < nFeatures; i++)
508  {
509  features::FeatureType fType = _featuresInfo.getDetectedFeatureType(i);
510  dictionary[i].ntFeature.featureType = fType;
511 
512  switch (fType)
513  {
514  case features::DAAL_CONTINUOUS:
515  dictionary[i].ntFeature.setType<DAAL_DATA_TYPE>();
516  break;
517 
518  case features::DAAL_ORDINAL:
519  case features::DAAL_CATEGORICAL:
520  dictionary[i].ntFeature.setType<int>();
521  break;
522  }
523  }
524 
525  fillAuxVectAndFuncList(dictionary);
526  }
527 
528  void fillAuxVectAndFuncList(DataSourceDictionary &dictionary)
529  {
530  const size_t nFeatures = dictionary.getNumberOfFeatures();
531  auxVect.resize(nFeatures);
532  funcList.resize(nFeatures);
533 
534  for (size_t i = 0; i < nFeatures; i++)
535  {
536  DataSourceFeature &feature = dictionary[i];
537  NumericTableFeature &ntFeature = feature.ntFeature;
538 
539  auxVect.push_back(FeatureAuxData(i, &feature, &ntFeature));
540  funcList.push_back(getModifierFunctionPtr(ntFeature));
541  }
542  }
543 
544  static functionT getModifierFunctionPtr(const NumericTableFeature &ntFeature)
545  {
546  switch (ntFeature.featureType)
547  {
548  case features::DAAL_CONTINUOUS:
549  return ModifierIface::contFunc;
550 
551  case features::DAAL_ORDINAL:
552  case features::DAAL_CATEGORICAL:
553  return ModifierIface::catFunc;
554  }
555  return ModifierIface::nullFunc;
556  }
557 
558 protected:
559  char _delimiter;
560  services::Collection<functionT> funcList;
561  services::Collection<FeatureAuxData> auxVect;
562 
563 private:
564  bool _isHeaderParsed;
565  size_t _numberOfTokens;
566  BlockDescriptor<DAAL_DATA_TYPE> _currentRowBlock;
567 
568  internal::CSVFeaturesInfo _featuresInfo;
569  modifiers::csv::internal::ModifiersManagerPtr _modifiersManager;
570 };
572 } // namespace interface1
573 
574 using interface1::CSVFeatureManager;
575 
576 } // namespace data_management
577 } // namespace daal
578 
579 #endif
daal::data_management::interface1::CSVFeatureManager::getNumericTableNumberOfColumns
size_t getNumericTableNumberOfColumns() const
Definition: csv_feature_manager.h:329
daal::data_management::interface1::CSVFeatureManager
Methods of the class to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:303
daal::data_management::interface1::CSVFeatureManager::setFeatureDetailsFromDictionary
services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:344
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:419
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionary.
Definition: data_dictionary.h:163
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:577
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:43
daal::services::ErrorNullPtr
Definition: error_indexes.h:141
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:453
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:71
daal::data_management::interface1::CSVFeatureManager::finalize
void finalize(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:492
daal::data_management::interface1::CSVFeatureManager::addModifier
CSVFeatureManager & addModifier(const features::FeatureIdCollectionIfacePtr &featureIds, const modifiers::csv::FeatureModifierIfacePtr &modifier, services::Status *status=NULL)
Definition: csv_feature_manager.h:372
daal::data_management::interface1::Dictionary::setNumberOfFeatures
virtual services::Status setNumberOfFeatures(size_t numberOfFeatures)
Definition: data_dictionary.h:268
daal::data_management::MakeCategorical
Methods of the class to set a feature categorical.
Definition: csv_feature_manager.h:149
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:309
daal::data_management::interface1::CSVFeatureManager::addModifier
void addModifier(const ModifierIface &modifier)
Definition: csv_feature_manager.h:360
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:319
daal::data_management::internal::CSVRowTokenizer
Class that parses single row in CSV file and implements iterator-like interface to iterate over the p...
Definition: csv_feature_utils.h:39
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:46
daal::data_management::interface1::CSVFeatureManager::parseRowAsHeader
void parseRowAsHeader(char *rawRowData, size_t rawDataSize)
Definition: csv_feature_manager.h:402
daal::data_management::interface1::BlockDescriptor::getNumberOfColumns
size_t getNumberOfColumns() const
Definition: numeric_table.h:97
daal::data_management::interface1::Dictionary::getNumberOfFeatures
size_t getNumberOfFeatures() const
Definition: data_dictionary.h:287
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:71
daal::data_management::ColumnFilter
Methods of the class to filter out data source features from output numeric table.
Definition: csv_feature_manager.h:207
daal::data_management::OneHotEncoder
Methods of the class to set a feature binary categorical.
Definition: csv_feature_manager.h:173

For more complete information about compiler optimizations, see our Optimization Notice.