Add CSV file parser based on boost::tokenizer

Effective C++ notes:
- explicitly declare special member functions
- explicitly declare overriding functions as override
This commit is contained in:
Geert Janssens 2016-01-26 09:48:29 +01:00 committed by Geert Janssens
parent ed7b863d8f
commit efcd266971
7 changed files with 145 additions and 0 deletions

View File

@ -436,6 +436,7 @@ src/import-export/csv-imp/csv-fixed-trans-import.c
src/import-export/csv-imp/gnc-csv-account-map.c
src/import-export/csv-imp/gnc-csv-gnumeric-popup.c
src/import-export/csv-imp/gnc-csv-model.c
src/import-export/csv-imp/gnc-csv-tokenizer.cpp
src/import-export/csv-imp/gnc-csv-trans-settings.c
src/import-export/csv-imp/gncmod-csv-import.c
src/import-export/csv-imp/gnc-plugin-csv-import.c

View File

@ -12,6 +12,7 @@ SET(csv_import_SOURCES
gnc-csv-account-map.c
gnc-csv-model.c
gnc-csv-gnumeric-popup.c
gnc-csv-tokenizer.cpp
gnc-csv-trans-settings.c
gnc-tokenizer.cpp
${CMAKE_SOURCE_DIR}/lib/stf/stf-parse.c
@ -33,6 +34,7 @@ SET(csv_import_noinst_HEADERS
gnc-csv-account-map.h
gnc-csv-model.h
gnc-csv-gnumeric-popup.h
gnc-csv-tokenizer.hpp
gnc-csv-trans-settings.h
gnc-tokenizer.hpp
${CMAKE_SOURCE_DIR}/lib/stf/stf-parse.h

View File

@ -12,6 +12,7 @@ libgncmod_csv_import_la_SOURCES = \
csv-fixed-trans-import.c \
gnc-csv-account-map.c \
gnc-csv-model.c \
gnc-csv-tokenizer.cpp \
gnc-csv-gnumeric-popup.c \
gnc-tokenizer.cpp \
gnc-csv-trans-settings.c
@ -25,6 +26,7 @@ noinst_HEADERS = \
csv-fixed-trans-import.h \
gnc-csv-account-map.h \
gnc-csv-model.h \
gnc-csv-tokenizer.hpp \
gnc-csv-gnumeric-popup.h \
gnc-tokenizer.hpp \
gnc-csv-trans-settings.h

View File

@ -0,0 +1,71 @@
#include "gnc-csv-tokenizer.hpp"
#include <iostream>
#include <fstream> // fstream
#include <sstream> // istringstream
#include <vector>
#include <string>
#include <algorithm> // copy
#include <iterator> // ostream_operator
#include <boost/tokenizer.hpp>
#include <boost/locale.hpp>
void
GncCsvTokenizer::set_separators(const std::string& separators)
{
sep_str = separators;
}
int GncCsvTokenizer::tokenize()
{
typedef boost::tokenizer< boost::escaped_list_separator<char> > Tokenizer;
boost::escaped_list_separator<char> sep("\\", sep_str, "\"");
std::vector<std::string> vec;
std::string line;
std::string buffer;
bool inside_quotes(false);
size_t last_quote(0);
tokenized_contents.clear();
std::istringstream in_stream(utf8_contents);
while (std::getline (in_stream, buffer))
{
// --- deal with line breaks in quoted strings
last_quote = buffer.find_first_of('"');
while (last_quote != std::string::npos)
{
if (last_quote == 0) // Test separately because last_quote - 1 would be out of range
inside_quotes = !inside_quotes;
else if (buffer[ last_quote - 1 ] != '\\')
inside_quotes = !inside_quotes;
last_quote = buffer.find_first_of('"',last_quote+1);
}
line.append(buffer);
if (inside_quotes)
{
line.append("\n");
continue;
}
// ---
Tokenizer tok(line, sep);
vec.assign(tok.begin(),tok.end());
line.clear(); // clear here, next check could fail
// example checking
// for correctly parsed 3 fields per record
if (vec.size() < 3) continue;
tokenized_contents.push_back(vec);
}
return 0;
}

View File

@ -0,0 +1,64 @@
/********************************************************************\
* gnc-csv-tokenizer.hpp - takes a csv file and converts it into a *
* two-dimensional vector of strings (table)*
* *
* This program is free software; you can redistribute it and/or *
* modify it under the terms of the GNU General Public License as *
* published by the Free Software Foundation; either version 2 of *
* the License, or (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License*
* along with this program; if not, contact: *
* *
* Free Software Foundation Voice: +1-617-542-5942 *
* 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 *
* Boston, MA 02110-1301, USA gnu@gnu.org *
\********************************************************************/
/** @file
@brief Class to convert a csv file into vector of string vectors.
One can define the separator characters to use to split each line
into multiple fields. Quote characters will be removed.
However, no gnucash specific interpretation is done yet, that's up
to the code using this class.
*
gnc-csv-tokenizer.hpp
@author Copyright (c) 2015 Geert Janssens <geert@kobaltwit.be>
*/
#ifndef GNC_CSV_TOKENIZER_HPP
#define GNC_CSV_TOKENIZER_HPP
extern "C" {
#include "config.h"
}
#include <iostream>
#include <fstream> // fstream
#include <vector>
#include <string>
#include "gnc-tokenizer.hpp"
/** Tokenizer for character-separated files.
 *
 *  Derives from GncTokenizer and overrides tokenize(). The separator
 *  characters default to a single comma and can be changed with
 *  set_separators().
 */
class GncCsvTokenizer : public GncTokenizer
{
public:
    // Construction and destruction
    GncCsvTokenizer() = default;
    ~GncCsvTokenizer() = default;

    // Copy and move construction
    GncCsvTokenizer(const GncCsvTokenizer&) = default;
    GncCsvTokenizer(GncCsvTokenizer&&) = default;

    // Copy and move assignment
    GncCsvTokenizer& operator=(const GncCsvTokenizer&) = default;
    GncCsvTokenizer& operator=(GncCsvTokenizer&&) = default;

    /** Set the characters that separate fields.
     *  @param separators String holding the separator characters
     */
    void set_separators(const std::string& separators);

    /** Tokenize the contents using the configured separators.
     *  @return Status code as an int
     */
    int tokenize() override;

private:
    /** Separator characters; a comma unless changed via set_separators() */
    std::string sep_str{","};
};
#endif

View File

@ -1,4 +1,5 @@
#include "gnc-tokenizer.hpp"
#include "gnc-csv-tokenizer.hpp"
#include <iostream>
#include <fstream> // fstream
@ -19,6 +20,9 @@ std::unique_ptr<GncTokenizer> GncTokenizerFactory(GncImpFileFormat fmt)
std::unique_ptr<GncTokenizer> tok(nullptr);
switch (fmt)
{
case GncImpFileFormat::CSV:
tok.reset(new GncCsvTokenizer());
break;
default:
break;
}

View File

@ -50,6 +50,7 @@ using str_vec = std::vector<std::string>;
/** Enumeration for file formats supported by this importer.
 *
 *  GncTokenizerFactory() switches on this value to pick the tokenizer
 *  implementation to instantiate.
 */
enum class GncImpFileFormat {
/* No specific format; presumably the factory creates no tokenizer
 * for this value -- TODO confirm against GncTokenizerFactory(). */
UNKNOWN,
/* Character-separated values; handled by GncCsvTokenizer. */
CSV,
};