24 #ifndef __CSV_FEATURE_MANAGER_H__
25 #define __CSV_FEATURE_MANAGER_H__
27 #include "data_management/data/numeric_table.h"
28 #include "data_management/features/shortcuts.h"
29 #include "data_management/data_source/data_source.h"
30 #include "data_management/data_source/internal/csv_feature_utils.h"
31 #include "data_management/data_source/modifiers/csv/shortcuts.h"
32 #include "data_management/data_source/modifiers/csv/internal/engine.h"
36 namespace data_management
47 dsFeat(0), ntFeat(0), nCats(0) { }
// Builds the auxiliary record for one feature from its data source feature and
// numeric table feature; starts with wide = 1, nCats = 0 and a default-constructed buffer.
// NOTE(review): the initializer that consumes `index` is not visible in this
// listing (original line 52 is absent) -- presumably idx(index); confirm.
49 explicit FeatureAuxData(
size_t index,
50 DataSourceFeature *dataSourceFeature,
51 NumericTableFeature *numericTableFeature) :
53 dsFeat(dataSourceFeature),
54 ntFeat(numericTableFeature),
55 wide(1), nCats(0), buffer() { }
// Data source feature; catFunc/binFunc reach its categorical dictionary through this pointer.
60 DataSourceFeature *dsFeat;
// Numeric table feature; catFunc/binFunc update its categoryNumber through this pointer.
61 NumericTableFeature *ntFeat;
// Pointer to a per-token conversion routine. It receives the token text, the
// auxiliary data of the feature the token belongs to, and the output row array
// (this is how parseRowIn invokes the entries of funcList).
65 typedef void (*functionT)(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr);
// Applies the modifier to the per-feature conversion functions and their
// auxiliary data; both collections are passed by non-const reference so that
// implementations can rewrite entries in place.
74 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const = 0;
// Virtual destructor so that concrete modifiers can be destroyed through the interface.
76 virtual ~ModifierIface() {}
// Conversion routine for continuous features: parses `word` as a number with readNumeric.
// NOTE(review): the declaration of `f` and the store of the parsed value into `arr`
// are not visible in this listing (original lines 80 and 82 are absent) -- confirm against the header.
78 static void contFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
81 readNumeric<>( word, f );
// Conversion routine for categorical features: maps the token to an integer
// category index using the categorical dictionary of the data source feature
// and writes that index to arr[aux.idx].
85 static void catFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
// Copy the token into the per-feature buffer; the buffer is used as the dictionary key.
87 aux.buffer.assign(word);
89 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
90 CategoricalFeatureDictionary::iterator it = catDict->find( aux.buffer );
// Known category: emit the index stored in the dictionary entry.
92 if( it != catDict->end() )
94 arr[ aux.idx ] = (DAAL_DATA_TYPE)it->second.first;
// New category (presumably the else branch -- the `else` line is absent from this
// listing): the next free index is the current dictionary size. The second pair
// member is initialised to 1; presumably an occurrence counter -- TODO confirm.
99 int index = (int)(catDict->size());
100 catDict->insert( std::pair<std::string, std::pair<int, int> >( aux.buffer, std::pair<int, int>(index, 1) ) );
101 arr[ aux.idx ] = (DAAL_DATA_TYPE)index;
// Keep the numeric table feature's category count in step with the dictionary.
102 aux.ntFeat->categoryNumber = index + 1;
// Conversion routine that does nothing; ColumnFilter installs it for filtered-out
// columns and getModifierFunctionPtr returns it as the fallback.
106 static void nullFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr) { }
// Converts `text` with daal::services::daal_string_to_float and stores the result in `f`.
// NOTE(review): the template header introducing T is not visible in this listing
// (original line 109 is absent). The meaning of the second argument (0) is defined
// by daal_string_to_float and cannot be determined from here.
110 static void readNumeric(
const char *text, T &f)
112 f = daal::services::daal_string_to_float(text, 0);
// Conversion routine for one-hot encoded categorical features: resolves the token
// to a category index and then writes aux.nCats values starting at arr[aux.idx],
// each being the result of (i == index).
// NOTE(review): the declaration of `index` and the `else` line are absent from this listing.
115 static void binFunc(
const char* word, FeatureAuxData& aux, DAAL_DATA_TYPE* arr)
// Copy the token into the per-feature buffer; the buffer is used as the dictionary key.
117 aux.buffer.assign(word);
119 CategoricalFeatureDictionary *catDict = aux.dsFeat->getCategoricalDictionary();
120 CategoricalFeatureDictionary::iterator it = catDict->find( aux.buffer );
// Known category: reuse the index stored in the dictionary entry.
124 if( it != catDict->end() )
126 index = it->second.first;
// New category (presumably the else branch): take the next free index and register it.
131 index = catDict->size();
132 catDict->insert( std::pair<std::string, std::pair<int, int> >( aux.buffer, std::pair<int, int>((
int)index, 1) ) );
133 aux.ntFeat->categoryNumber = index + 1;
136 size_t nCats = aux.nCats;
// When index is not below nCats no position compares equal, so every written value is 0.
138 for(
size_t i=0; i<nCats; i++)
140 arr[ aux.idx + i ] = (DAAL_DATA_TYPE)(i == index);
// Modifier that switches the feature at position `idx` to the categorical
// conversion routine (catFunc).
// NOTE(review): the class braces, access specifiers and the `idx` member
// declaration are absent from this listing.
149 class MakeCategorical :
public ModifierIface
// Remembers the index of the feature to be made categorical.
153 MakeCategorical(
size_t idx) : idx(idx) {}
155 virtual ~MakeCategorical() {}
// Installs catFunc for feature `idx` and resizes that feature's buffer to 1024.
// NOTE(review): original lines 160-162 are absent; presumably a bounds check of
// idx against nCols guards the two assignments -- confirm.
157 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
159 size_t nCols = funcList.size();
163 funcList[idx] = catFunc;
164 auxVect[idx].buffer.resize(1024);
// Modifier that switches the feature at position `idx` to one-hot encoding
// (binFunc) spanning `nCats` output columns.
// NOTE(review): the class braces, access specifiers and member declarations are
// absent from this listing.
173 class OneHotEncoder :
public ModifierIface
// Remembers the feature index and the number of categories to encode.
178 OneHotEncoder(
size_t idx,
size_t nCats) : idx(idx), nCats(nCats) {}
180 virtual ~OneHotEncoder() {}
// Installs binFunc for feature `idx`, records nCats as both the category count and
// the output width of that feature, then recomputes every feature's output offset.
// NOTE(review): original lines 185-187 are absent; presumably a bounds check of
// idx against nCols guards the assignments -- confirm.
182 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
184 size_t nCols = funcList.size();
188 funcList[idx] = binFunc;
189 auxVect[idx].buffer.resize(1024);
190 auxVect[idx].nCats = nCats;
191 auxVect[idx].wide = nCats;
// Each feature's output offset is the running sum of the widths of the features before it.
// NOTE(review): the declaration of nNTCols is absent from this listing -- presumably initialised to 0.
195 for(
size_t i=0; i<nCols; i++)
197 auxVect[i].idx = nNTCols;
198 nNTCols += auxVect[i].wide;
// Modifier that disables conversion of selected columns by replacing their
// conversion routine with nullFunc. The selection is configured through the
// chainable odd()/even()/none()/list() setters.
// NOTE(review): the class braces, access specifiers, the flag member declarations
// and the conditions guarding each loop in apply() are absent from this listing.
207 class ColumnFilter :
public ModifierIface
// Column indices supplied through list().
213 services::Collection<size_t> validList;
// All selection flags start cleared.
215 ColumnFilter() : oddFlag(
false), evenFlag(
false), noneFlag(
false), listFlag(
false) {}
217 virtual ~ColumnFilter() {}
// Each setter raises its flag and returns *this so calls can be chained.
219 ColumnFilter& odd() { oddFlag=
true;
return *
this;}
220 ColumnFilter& even() {evenFlag=
true;
return *
this;}
221 ColumnFilter& none() {noneFlag=
true;
return *
this;}
// Stores a copy of `valid` in validList and raises listFlag.
222 ColumnFilter& list(services::Collection<size_t> valid) {validList=valid; listFlag=
true;
return *
this;}
// Replaces the conversion routine of filtered columns with nullFunc and then
// recomputes every feature's output offset.
224 virtual void apply(services::Collection<functionT> &funcList, services::Collection<FeatureAuxData> &auxVect)
const
226 size_t nCols = funcList.size();
// Disables columns 0, 2, 4, ... (which flag guards this loop is not visible here).
230 for(
size_t i=0; i<nCols; i+=2)
232 funcList[i] = nullFunc;
// Disables columns 1, 3, 5, ... (guard not visible here).
239 for(
size_t i=1; i<nCols; i+=2)
241 funcList[i] = nullFunc;
// Disables every column (guard not visible here).
248 for(
size_t i=0; i<nCols; i++)
250 funcList[i] = nullFunc;
// List mode: a per-column flag collection decides which columns keep their routine.
// NOTE(review): the statements that fill `flags` (the bodies of the next two loops)
// are absent from this listing -- presumably flags start false and are set for
// every index found in validList; confirm.
257 services::Collection<bool> flags(nCols);
259 for(
size_t i=0; i<nCols; i++)
264 for(
size_t i=0; i<validList.size(); i++)
266 size_t el = validList[i];
// Columns whose flag is set are skipped; all others are disabled.
273 for(
size_t i=0; i<nCols; i++)
275 if(flags[i])
continue;
276 funcList[i] = nullFunc;
// Each feature's output offset is the running sum of the widths of the features before it.
// NOTE(review): the declaration of nNTCols is absent from this listing -- presumably initialised to 0.
282 for(
size_t i=0; i<nCols; i++)
284 auxVect[i].idx = nNTCols;
285 nNTCols += auxVect[i].wide;
// Feature manager for CSV input; implements StringRowFeatureManagerIface.
303 class CSVFeatureManager :
public StringRowFeatureManagerIface
// Default constructor; marks the header as not yet parsed.
// NOTE(review): original lines 310-311 (further member initializers) are absent from this listing.
309 CSVFeatureManager() :
312 _isHeaderParsed(false) { }
314 virtual ~CSVFeatureManager() { }
// Sets the character stored in _delimiter, which the parse* methods pass to
// CSVRowTokenizer when splitting a row.
319 void setDelimiter(
char delimiter )
321 _delimiter = delimiter;
329 size_t getNumericTableNumberOfColumns()
const
331 if (_modifiersManager)
333 return _modifiersManager->getNumberOfOutputFeatures();
336 const size_t nDSCols = auxVect.size();
337 return auxVect[nDSCols - 1].idx + auxVect[nDSCols - 1].wide;
// Initialises the per-feature conversion functions and auxiliary data from an
// existing dictionary and records its feature count as the expected number of
// tokens per row. Returns a default-constructed Status on this path.
344 services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
// Null-pointer guard; the DAAL_CHECK macro is given services::ErrorNullPtr as the error to report.
346 DAAL_CHECK( dictionary, services::ErrorNullPtr );
350 fillAuxVectAndFuncList(*dictionary);
351 _numberOfTokens = dictionary->getNumberOfFeatures();
353 return services::Status();
// Applies a ModifierIface-style modifier directly to this manager's
// conversion-function list and auxiliary data.
360 void addModifier(
const ModifierIface &modifier)
362 modifier.apply(funcList, auxVect);
// Registers a CSV feature modifier for the given feature ids. The modifiers
// manager is created on first use. `status` is optional (defaults to NULL) and
// is handed to services::internal::tryAssignStatusAndThrow together with the
// local status.
// NOTE(review): the conditions guarding the tryAssignStatusAndThrow calls and the
// `return *this;` statements are absent from this listing.
372 CSVFeatureManager &addModifier(
const features::FeatureIdCollectionIfacePtr &featureIds,
373 const modifiers::csv::FeatureModifierIfacePtr &modifier,
374 services::Status *status = NULL)
376 services::Status localStatus;
// Lazy creation of the modifiers manager.
377 if (!_modifiersManager)
379 _modifiersManager = modifiers::csv::internal::ModifiersManager::create(&localStatus);
382 services::internal::tryAssignStatusAndThrow(status, localStatus);
// Accumulate the outcome of the registration into the local status.
387 localStatus |= _modifiersManager->addModifier(featureIds, modifier);
390 services::internal::tryAssignStatusAndThrow(status, localStatus);
// Treats the row as a header line: splits it on _delimiter and records every
// token as a feature name in _featuresInfo.
402 void parseRowAsHeader(
char *rawRowData,
size_t rawDataSize)
404 DAAL_ASSERT( rawRowData );
406 internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
407 for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
409 _featuresInfo.addFeatureName(tokenizer.getCurrentToken());
// Uses the row to determine feature types: every token is passed to
// _featuresInfo.addFeatureType, after which the dictionary is filled either by
// the modifiers manager or by fillDictionaryWithoutModifiers.
419 virtual void parseRowAsDictionary(
char *rawRowData,
size_t rawDataSize,
420 DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
422 DAAL_ASSERT( rawRowData );
423 DAAL_ASSERT( dictionary );
427 internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
428 for (tokenizer.reset(); tokenizer.good(); tokenizer.next())
431 _featuresInfo.addFeatureType(tokenizer.getCurrentToken());
// With a modifiers manager: prepare it from the collected feature info and let it fill the dictionary.
434 if (_modifiersManager)
436 _modifiersManager->prepare(_featuresInfo);
437 _modifiersManager->fillDictionary(*dictionary);
// Without one (presumably the else branch -- the `else` line is absent from this listing).
441 fillDictionaryWithoutModifiers(*dictionary);
// Parses one CSV row into row `ntRowIndex` of the numeric table `nt`.
// At most _numberOfTokens tokens of the row are consumed.
// NOTE(review): the declaration of the token counter `i` and the `else` line are
// absent from this listing; no assertion on `nt` is visible either.
453 virtual void parseRowIn(
char *rawRowData,
size_t rawDataSize, DataSourceDictionary *dictionary,
454 NumericTable *nt,
size_t ntRowIndex) DAAL_C11_OVERRIDE
457 DAAL_ASSERT( dictionary );
458 DAAL_ASSERT( rawRowData );
// Acquire a single-row block in writeOnly mode and wrap its storage in a buffer view.
460 nt->getBlockOfRows(ntRowIndex, 1, writeOnly, _currentRowBlock);
461 services::BufferView<DAAL_DATA_TYPE> rowBuffer(_currentRowBlock.getBlockPtr(),
462 _currentRowBlock.getNumberOfColumns());
465 internal::CSVRowTokenizer tokenizer(rawRowData, rawDataSize, _delimiter);
// Modifier path: hand every token to the modifiers manager, then let it produce the row.
467 if (_modifiersManager)
469 for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
471 _modifiersManager->setToken(i, tokenizer.getCurrentToken());
473 _modifiersManager->applyModifiers(rowBuffer);
// Legacy path (presumably the else branch): call the i-th conversion routine on the i-th token.
477 DAAL_DATA_TYPE *row = rowBuffer.data();
478 for (tokenizer.reset(); tokenizer.good() && i < _numberOfTokens; tokenizer.next(), i++)
480 const services::StringView token = tokenizer.getCurrentToken();
481 funcList[i](token.c_str(), auxVect[i], row);
// Hand the block back to the numeric table.
485 nt->releaseBlockOfRows(_currentRowBlock);
// When a modifiers manager is installed, finalizes it and lets it fill the
// dictionary again. No other action is visible in this listing.
492 void finalize(DataSourceDictionary *dictionary)
494 if (_modifiersManager)
496 _modifiersManager->finalize();
497 _modifiersManager->fillDictionary(*dictionary);
// Fills the dictionary from the feature types detected in _featuresInfo and then
// builds the matching conversion functions and auxiliary data.
// NOTE(review): the `switch` header and the statements ending each case are
// absent from this listing.
502 void fillDictionaryWithoutModifiers(DataSourceDictionary &dictionary)
504 const size_t nFeatures = _featuresInfo.getNumberOfFeatures();
505 dictionary.setNumberOfFeatures(nFeatures);
507 for (
size_t i = 0; i < nFeatures; i++)
509 features::FeatureType fType = _featuresInfo.getDetectedFeatureType(i);
510 dictionary[i].ntFeature.featureType = fType;
// Continuous features are typed as DAAL_DATA_TYPE.
514 case features::DAAL_CONTINUOUS:
515 dictionary[i].ntFeature.setType<DAAL_DATA_TYPE>();
// Ordinal and categorical features are typed as int.
518 case features::DAAL_ORDINAL:
519 case features::DAAL_CATEGORICAL:
520 dictionary[i].ntFeature.setType<
int>();
525 fillAuxVectAndFuncList(dictionary);
// Creates one FeatureAuxData entry and one conversion routine per dictionary feature.
// NOTE(review): resize(nFeatures) followed by push_back yields exactly nFeatures
// entries only if services::Collection::resize changes capacity rather than size,
// and only if both collections are empty on entry -- nothing visible here clears
// them before a second call. Confirm both points.
528 void fillAuxVectAndFuncList(DataSourceDictionary &dictionary)
530 const size_t nFeatures = dictionary.getNumberOfFeatures();
531 auxVect.resize(nFeatures);
532 funcList.resize(nFeatures);
534 for (
size_t i = 0; i < nFeatures; i++)
536 DataSourceFeature &feature = dictionary[i];
537 NumericTableFeature &ntFeature = feature.ntFeature;
// The auxiliary data keeps pointers into the dictionary's own feature objects.
539 auxVect.push_back(FeatureAuxData(i, &feature, &ntFeature));
540 funcList.push_back(getModifierFunctionPtr(ntFeature));
// Selects the conversion routine for a feature from its feature type:
// contFunc for continuous, catFunc for ordinal and categorical, nullFunc otherwise.
544 static functionT getModifierFunctionPtr(
const NumericTableFeature &ntFeature)
546 switch (ntFeature.featureType)
548 case features::DAAL_CONTINUOUS:
549 return ModifierIface::contFunc;
551 case features::DAAL_ORDINAL:
552 case features::DAAL_CATEGORICAL:
553 return ModifierIface::catFunc;
// Fallback for every feature type not handled above.
555 return ModifierIface::nullFunc;
// Per-feature conversion routines, indexed by token position in parseRowIn.
560 services::Collection<functionT> funcList;
// Per-feature auxiliary data, parallel to funcList.
561 services::Collection<FeatureAuxData> auxVect;
// Set to false by the constructor; no other use is visible in this listing.
564 bool _isHeaderParsed;
// Upper bound on the number of tokens consumed per row in parseRowIn.
565 size_t _numberOfTokens;
// Row block descriptor reused by parseRowIn for every parsed row.
566 BlockDescriptor<DAAL_DATA_TYPE> _currentRowBlock;
// Feature names and types collected by parseRowAsHeader / parseRowAsDictionary.
568 internal::CSVFeaturesInfo _featuresInfo;
// Created lazily by addModifier(featureIds, modifier, status); when set, it takes
// over dictionary filling and row conversion.
569 modifiers::csv::internal::ModifiersManagerPtr _modifiersManager;
574 using interface1::CSVFeatureManager;
daal::data_management::interface1::CSVFeatureManager::getNumericTableNumberOfColumns
size_t getNumericTableNumberOfColumns() const
Definition: csv_feature_manager.h:329
daal::data_management::interface1::CSVFeatureManager
Methods of the class to preprocess data represented in the CSV format.
Definition: csv_feature_manager.h:303
daal::data_management::interface1::CSVFeatureManager::setFeatureDetailsFromDictionary
services::Status setFeatureDetailsFromDictionary(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:344
daal::data_management::interface1::CSVFeatureManager::parseRowAsDictionary
virtual void parseRowAsDictionary(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:419
daal::data_management::interface1::Dictionary
Class that represents a dictionary of a data set and provides methods to work with the data dictionary.
Definition: data_dictionary.h:163
daal::data_management::interface1::NumericTable
Class for a data management component responsible for representation of data in the numeric format...
Definition: numeric_table.h:577
daal::data_management::FeatureAuxData
Structure for auxiliary data used for feature extraction.
Definition: csv_feature_manager.h:43
daal::services::ErrorNullPtr
Definition: error_indexes.h:141
daal::data_management::interface1::CSVFeatureManager::parseRowIn
virtual void parseRowIn(char *rawRowData, size_t rawDataSize, DataSourceDictionary *dictionary, NumericTable *nt, size_t ntRowIndex) DAAL_C11_OVERRIDE
Definition: csv_feature_manager.h:453
daal::data_management::ModifierIface
Abstract interface class that defines the interface for a features modifier.
Definition: csv_feature_manager.h:71
daal::data_management::interface1::CSVFeatureManager::finalize
void finalize(DataSourceDictionary *dictionary)
Definition: csv_feature_manager.h:492
daal::data_management::interface1::CSVFeatureManager::addModifier
CSVFeatureManager & addModifier(const features::FeatureIdCollectionIfacePtr &featureIds, const modifiers::csv::FeatureModifierIfacePtr &modifier, services::Status *status=NULL)
Definition: csv_feature_manager.h:372
daal::data_management::interface1::Dictionary::setNumberOfFeatures
virtual services::Status setNumberOfFeatures(size_t numberOfFeatures)
Definition: data_dictionary.h:268
daal::data_management::MakeCategorical
Methods of the class to set a feature categorical.
Definition: csv_feature_manager.h:149
daal::data_management::interface1::CSVFeatureManager::CSVFeatureManager
CSVFeatureManager()
Definition: csv_feature_manager.h:309
daal::data_management::interface1::CSVFeatureManager::addModifier
void addModifier(const ModifierIface &modifier)
Definition: csv_feature_manager.h:360
daal::data_management::interface1::CSVFeatureManager::setDelimiter
void setDelimiter(char delimiter)
Definition: csv_feature_manager.h:319
daal::data_management::internal::CSVRowTokenizer
Class that parses single row in CSV file and implements iterator-like interface to iterate over the p...
Definition: csv_feature_utils.h:39
daal::data_management::interface1::StringRowFeatureManagerIface
Abstract interface class that defines the interface to parse and convert the raw data represented as ...
Definition: data_source_utils.h:46
daal::data_management::interface1::CSVFeatureManager::parseRowAsHeader
void parseRowAsHeader(char *rawRowData, size_t rawDataSize)
Definition: csv_feature_manager.h:402
daal::data_management::interface1::BlockDescriptor::getNumberOfColumns
size_t getNumberOfColumns() const
Definition: numeric_table.h:97
daal::data_management::interface1::Dictionary::getNumberOfFeatures
size_t getNumberOfFeatures() const
Definition: data_dictionary.h:287
daal::data_management::interface1::BlockDescriptor::getBlockPtr
DataType * getBlockPtr() const
Definition: numeric_table.h:71
daal::data_management::ColumnFilter
Methods of the class to filter out data source features from output numeric table.
Definition: csv_feature_manager.h:207
daal::data_management::OneHotEncoder
Methods of the class to set a feature binary categorical.
Definition: csv_feature_manager.h:173