C++ API Reference for Intel® Data Analytics Acceleration Library 2020 Update 1

csv_feature_utils.h
1 /* file: csv_feature_utils.h */
2 /*******************************************************************************
3 * Copyright 2014-2020 Intel Corporation
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *******************************************************************************/
17 
18 #ifndef __CSV_FEATURE_UTILS_H__
19 #define __CSV_FEATURE_UTILS_H__
20 
21 #include <sstream>
22 
23 #include "services/collection.h"
24 #include "services/daal_string.h"
25 #include "data_management/features/defines.h"
26 
27 namespace daal
28 {
29 namespace data_management
30 {
31 namespace internal
32 {
33 
39 class CSVRowTokenizer : public Base
40 {
41 private:
42  char *_rawData;
43  const size_t _rawDataSize;
44  const char _delimiter;
45 
46  size_t _pos;
47  size_t _prevPos;
48  size_t _tokenSize;
49  bool _goodFlag;
50 
51 public:
52  explicit CSVRowTokenizer(char *rawData, size_t rawDataSize, char delimiter) :
53  _rawData(rawData),
54  _rawDataSize(rawDataSize),
55  _delimiter(delimiter),
56  _pos(0),
57  _prevPos(0),
58  _tokenSize(0),
59  _goodFlag(true) { }
60 
61  void reset()
62  {
63  _pos = 0;
64  _prevPos = 0;
65  _tokenSize = 0;
66  _goodFlag = true;
67 
68  next();
69  }
70 
71  DAAL_FORCEINLINE void next()
72  {
73  /* We assume _rawData is single line of CSV file and
74  * has a termination character in the end */
75 
76  if (!good()) { return; }
77 
78  _prevPos = _pos;
79 
80  while (isValidSymbol(_pos) && !isStopSymbol(_pos))
81  { _pos++; }
82 
83  _tokenSize = _pos - _prevPos;
84  _goodFlag = isValidSymbol(_prevPos);
85 
86  if (isValidSymbol(_pos) && isStopSymbol(_pos))
87  {
88  _rawData[_pos] = '\0';
89  _pos++;
90  }
91  }
92 
93  DAAL_FORCEINLINE bool good() const
94  {
95  return _goodFlag;
96  }
97 
98  DAAL_FORCEINLINE services::StringView getCurrentToken() const
99  {
100  return services::StringView(_rawData + _prevPos, _tokenSize);
101  }
102 
103 private:
104  DAAL_FORCEINLINE bool isValidSymbol(size_t index) const
105  {
106  return index < _rawDataSize &&
107  _rawData[index] != '\0';
108  }
109 
110  DAAL_FORCEINLINE bool isStopSymbol(size_t index) const
111  {
112  return _rawData[index] == _delimiter;
113  }
114 
115  CSVRowTokenizer(const CSVRowTokenizer &);
116  CSVRowTokenizer &operator=(const CSVRowTokenizer &);
117 };
118 
123 class CSVFeaturesInfo : public Base
124 {
125 public:
126  services::Status addFeatureName(const services::StringView &featureName)
127  {
128  services::Status status = services::internal::checkForNullByteInjection(featureName.begin(), featureName.end());
129  if (!status)
130  {
131  return services::throwIfPossible(services::ErrorNullByteInjection);
132  }
133  const services::String featureNameStr(featureName.begin());
134  if ( !_featureNames.safe_push_back(featureNameStr) )
135  {
136  return services::throwIfPossible(services::ErrorMemoryAllocationFailed);
137  }
138  return services::Status();
139  }
140 
141  services::Status addFeatureType(const services::StringView &token)
142  {
143  const features::FeatureType featureType = detectFeatureType(token);
144  if ( !_featureTypes.safe_push_back(featureType) )
145  {
146  return services::throwIfPossible(services::ErrorMemoryAllocationFailed);
147  }
148  return services::Status();
149  }
150 
151  size_t getNumberOfFeatures() const
152  {
153  /* We allow _featureNames to be empty to support a no-header case */
154  if (_featureNames.size() != 0)
155  {
156  DAAL_ASSERT( _featureNames.size() == _featureTypes.size() );
157  return _featureNames.size();
158  }
159  return _featureTypes.size();
160  }
161 
162  const services::String &getFeatureName(size_t featureIndex) const
163  {
164  DAAL_ASSERT( _featureNames.size() == 0 ||
165  _featureNames.size() == _featureTypes.size() );
166  DAAL_ASSERT( featureIndex < _featureNames.size() );
167  return _featureNames[featureIndex];
168  }
169 
170  features::FeatureType getDetectedFeatureType(size_t featureIndex) const
171  {
172  DAAL_ASSERT( featureIndex < _featureTypes.size() );
173  return _featureTypes[featureIndex];
174  }
175 
176  bool areFeatureNamesAvailable() const
177  {
178  return _featureNames.size() > 0;
179  }
180 
181 private:
182  static features::FeatureType detectFeatureType(const services::StringView &token)
183  {
184  return isNumericalFeature(token)
185  ? features::DAAL_CONTINUOUS
186  : features::DAAL_CATEGORICAL;
187  }
188 
189  static bool isNumericalFeature(const services::StringView &token)
190  {
191  std::istringstream iss(token.c_str());
192  DAAL_DATA_TYPE f = 0.0; iss >> f;
193  return !(iss.fail());
194  }
195 
196 private:
197  services::Collection<services::String> _featureNames;
198  services::Collection<features::FeatureType> _featureTypes;
199 };
200 
201 } // namespace internal
202 } // namespace data_management
203 } // namespace daal
204 
205 #endif
daal::data_management::internal::CSVFeaturesInfo
Class that holds auxiliary information about features being parsed.
Definition: csv_feature_utils.h:123
daal::Base
Base class for Intel(R) Data Analytics Acceleration Library objects
Definition: base.h:41
daal::services::ErrorNullByteInjection
Definition: error_indexes.h:394
daal::data_management::internal::CSVRowTokenizer
Class that parses a single row of a CSV file and implements an iterator-like interface to iterate over the parsed tokens.
Definition: csv_feature_utils.h:39
daal::services::ErrorMemoryAllocationFailed
Definition: error_indexes.h:150

For more complete information about compiler optimizations, see our Optimization Notice.