18 #ifndef __CSV_FEATURE_UTILS_H__
19 #define __CSV_FEATURE_UTILS_H__
23 #include "services/collection.h"
24 #include "services/daal_string.h"
25 #include "data_management/features/defines.h"
29 namespace data_management
/**
 * Tokenizer for a single row of CSV text. It walks a character buffer and
 * exposes the delimiter-separated fields one at a time through
 * next() / good() / getCurrentToken().
 * NOTE(review): only part of the member list is visible in this listing.
 */
39 class CSVRowTokenizer :
public Base
// Number of characters in the raw buffer; isValidSymbol() uses it as the exclusive upper bound for indices.
43 const size_t _rawDataSize;
// Field separator; isStopSymbol() compares buffer characters against it.
44 const char _delimiter;
/**
 * Constructs a tokenizer over a character buffer.
 * \param rawData      Buffer with the row text; taken as non-const char*, and next() writes '\0'
 *                     into _rawData (presumably this same buffer - initializer not visible here)
 * \param rawDataSize  Stored in _rawDataSize
 * \param delimiter    Stored in _delimiter
 * NOTE(review): the remaining member initializers and the constructor body are
 * not visible in this listing - confirm against the full header.
 */
52 explicit CSVRowTokenizer(
char *rawData,
size_t rawDataSize,
char delimiter) :
54 _rawDataSize(rawDataSize),
55 _delimiter(delimiter),
/**
 * Advances the tokenizer to the next field of the row.
 * Returns immediately when good() is false. Otherwise it scans while the
 * character at _pos is valid and is not the delimiter, records the token
 * length as _pos - _prevPos, and sets _goodFlag from whether _prevPos
 * addresses a valid character. If the scan stopped on a delimiter, that
 * delimiter is overwritten with '\0' inside _rawData.
 * NOTE(review): the statements that set _prevPos and advance _pos are not
 * visible in this listing - confirm against the full header.
 */
71 DAAL_FORCEINLINE
void next()
// Nothing to do once iteration has finished.
76 if (!good()) {
return; }
// Scan forward until the end of the data or a delimiter is reached.
80 while (isValidSymbol(_pos) && !isStopSymbol(_pos))
83 _tokenSize = _pos - _prevPos;
// The token is usable only if its first position addresses a valid character.
84 _goodFlag = isValidSymbol(_prevPos);
// Stopped on a delimiter: terminate the token in place.
86 if (isValidSymbol(_pos) && isStopSymbol(_pos))
88 _rawData[_pos] =
'\0';
/**
 * Status query; next() calls it as its early-exit guard.
 * NOTE(review): the body is not visible in this listing - presumably it
 * returns the _goodFlag value maintained by next(); confirm.
 */
93 DAAL_FORCEINLINE
bool good()
const
98 DAAL_FORCEINLINE services::StringView getCurrentToken()
const
100 return services::StringView(_rawData + _prevPos, _tokenSize);
104 DAAL_FORCEINLINE
bool isValidSymbol(
size_t index)
const
106 return index < _rawDataSize &&
107 _rawData[index] !=
'\0';
110 DAAL_FORCEINLINE
bool isStopSymbol(
size_t index)
const
112 return _rawData[index] == _delimiter;
// Copy constructor and copy assignment are declared here with no definition
// visible in this listing - presumably the declare-but-do-not-define idiom that
// makes the tokenizer non-copyable. NOTE(review): confirm the access level.
115 CSVRowTokenizer(
const CSVRowTokenizer &);
116 CSVRowTokenizer &operator=(
const CSVRowTokenizer &);
/**
 * Holds auxiliary information about the features being parsed: the collected
 * feature names and the feature type detected for each feature.
 */
123 class CSVFeaturesInfo :
public Base
126 services::Status addFeatureName(
const services::StringView &featureName)
128 services::Status status = services::internal::checkForNullByteInjection(featureName.begin(), featureName.end());
131 return services::throwIfPossible(services::ErrorNullByteInjection);
133 const services::String featureNameStr(featureName.begin());
134 if ( !_featureNames.safe_push_back(featureNameStr) )
136 return services::throwIfPossible(services::ErrorMemoryAllocationFailed);
138 return services::Status();
141 services::Status addFeatureType(
const services::StringView &token)
143 const features::FeatureType featureType = detectFeatureType(token);
144 if ( !_featureTypes.safe_push_back(featureType) )
146 return services::throwIfPossible(services::ErrorMemoryAllocationFailed);
148 return services::Status();
151 size_t getNumberOfFeatures()
const
154 if (_featureNames.size() != 0)
156 DAAL_ASSERT( _featureNames.size() == _featureTypes.size() );
157 return _featureNames.size();
159 return _featureTypes.size();
162 const services::String &getFeatureName(
size_t featureIndex)
const
164 DAAL_ASSERT( _featureNames.size() == 0 ||
165 _featureNames.size() == _featureTypes.size() );
166 DAAL_ASSERT( featureIndex < _featureNames.size() );
167 return _featureNames[featureIndex];
170 features::FeatureType getDetectedFeatureType(
size_t featureIndex)
const
172 DAAL_ASSERT( featureIndex < _featureTypes.size() );
173 return _featureTypes[featureIndex];
176 bool areFeatureNamesAvailable()
const
178 return _featureNames.size() > 0;
182 static features::FeatureType detectFeatureType(
const services::StringView &token)
184 return isNumericalFeature(token)
185 ? features::DAAL_CONTINUOUS
186 : features::DAAL_CATEGORICAL;
189 static bool isNumericalFeature(
const services::StringView &token)
191 std::istringstream iss(token.c_str());
192 DAAL_DATA_TYPE f = 0.0; iss >> f;
193 return !(iss.fail());
// Feature names appended by addFeatureName(); may stay empty, in which case
// getNumberOfFeatures() falls back to the number of detected types.
197 services::Collection<services::String> _featureNames;
// Feature types appended by addFeatureType(), one entry per call.
198 services::Collection<features::FeatureType> _featureTypes;
daal::data_management::internal::CSVFeaturesInfo
Class that holds auxiliary information about features being parsed.
Definition: csv_feature_utils.h:123
daal::Base
Base class for Intel(R) Data Analytics Acceleration Library objects
Definition: base.h:41
daal::services::ErrorNullByteInjection
Definition: error_indexes.h:394
daal::data_management::internal::CSVRowTokenizer
Class that parses a single row of a CSV file and implements an iterator-like interface to iterate over the parsed tokens.
Definition: csv_feature_utils.h:39
daal::services::ErrorMemoryAllocationFailed
Definition: error_indexes.h:150