Add fixed width file parser based on boost::tokenize

Effective c++ notes:
- explicitly declare special member functions
- explicitly declare overriding functions as override
This commit is contained in:
Geert Janssens 2016-01-28 12:30:29 +01:00 committed by Geert Janssens
parent efcd266971
commit 48cfbc23d4
7 changed files with 125 additions and 0 deletions

View File

@ -438,6 +438,7 @@ src/import-export/csv-imp/gnc-csv-gnumeric-popup.c
src/import-export/csv-imp/gnc-csv-model.c
src/import-export/csv-imp/gnc-csv-tokenizer.cpp
src/import-export/csv-imp/gnc-csv-trans-settings.c
src/import-export/csv-imp/gnc-fw-tokenizer.cpp
src/import-export/csv-imp/gncmod-csv-import.c
src/import-export/csv-imp/gnc-plugin-csv-import.c
src/import-export/csv-imp/gnc-tokenizer.cpp

View File

@ -14,6 +14,7 @@ SET(csv_import_SOURCES
gnc-csv-gnumeric-popup.c
gnc-csv-tokenizer.cpp
gnc-csv-trans-settings.c
gnc-fw-tokenizer.cpp
gnc-tokenizer.cpp
${CMAKE_SOURCE_DIR}/lib/stf/stf-parse.c
${CMAKE_SOURCE_DIR}/lib/goffice/go-charmap-sel.c
@ -36,6 +37,7 @@ SET(csv_import_noinst_HEADERS
gnc-csv-gnumeric-popup.h
gnc-csv-tokenizer.hpp
gnc-csv-trans-settings.h
gnc-fw-tokenizer.hpp
gnc-tokenizer.hpp
${CMAKE_SOURCE_DIR}/lib/stf/stf-parse.h
${CMAKE_SOURCE_DIR}/lib/goffice/go-charmap-sel.h

View File

@ -14,6 +14,7 @@ libgncmod_csv_import_la_SOURCES = \
gnc-csv-model.c \
gnc-csv-tokenizer.cpp \
gnc-csv-gnumeric-popup.c \
gnc-fw-tokenizer.cpp \
gnc-tokenizer.cpp \
gnc-csv-trans-settings.c
@ -28,6 +29,7 @@ noinst_HEADERS = \
gnc-csv-model.h \
gnc-csv-tokenizer.hpp \
gnc-csv-gnumeric-popup.h \
gnc-fw-tokenizer.hpp \
gnc-tokenizer.hpp \
gnc-csv-trans-settings.h

View File

@ -0,0 +1,48 @@
#include "gnc-fw-tokenizer.hpp"
#include <iostream>
#include <fstream> // fstream
#include <vector>
#include <string>
#include <algorithm> // copy
#include <iterator> // ostream_operator
#include <boost/tokenizer.hpp>
#include <boost/locale.hpp>
void
GncFwTokenizer::columns(const std::vector<uint>& cols)
{
col_vec = cols;
}
/** Split the loaded contents into rows of fixed-width fields.
 *
 * Each line of utf8_contents is cut at the column widths previously set
 * with columns(). The resulting rows replace whatever was stored in
 * tokenized_contents before the call.
 *
 * Lines that yield no fields at all (for example empty lines) are skipped.
 * Rows are NOT required to have any particular number of fields: a line
 * shorter than the configured layout produces a shorter row, and a layout
 * with only one or two columns is parsed like any other.
 *
 * NOTE(review): widths are applied to the chars of a std::string, so a
 * multi-byte UTF-8 sequence presumably counts as several positions and may
 * be split across fields - confirm whether callers need character-based
 * widths.
 *
 * @return Always 0.
 */
int GncFwTokenizer::tokenize()
{
    using Tokenizer = boost::tokenizer< boost::offset_separator >;

    // Third argument (wrap_offsets) is false: once every configured width
    // has been consumed the rest of the line is ignored instead of starting
    // over at the first width.
    boost::offset_separator sep(col_vec.begin(), col_vec.end(), false);

    std::vector<std::string> vec;
    std::string line;

    tokenized_contents.clear();
    std::istringstream in_stream(utf8_contents);

    // std::getline replaces the contents of 'line' on every call, so no
    // explicit clearing is needed between iterations.
    while (std::getline (in_stream, line))
    {
        Tokenizer tok(line, sep);
        vec.assign(tok.begin(), tok.end());

        // Only drop rows without any field. The former hard-coded minimum
        // of three fields discarded every row of narrower layouts.
        if (vec.empty())
            continue;

        tokenized_contents.push_back(vec);
    }

    return 0;
}

View File

@ -0,0 +1,67 @@
/********************************************************************\
* gnc-fw-tokenizer.hpp - takes a file and converts it into a *
* two-dimensional vector of strings (table) *
* splitting the contents on fixed width *
* positions *
* *
* This program is free software; you can redistribute it and/or *
* modify it under the terms of the GNU General Public License as *
* published by the Free Software Foundation; either version 2 of *
* the License, or (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License*
* along with this program; if not, contact: *
* *
* Free Software Foundation Voice: +1-617-542-5942 *
* 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 *
* Boston, MA 02110-1301, USA gnu@gnu.org *
\********************************************************************/
/** @file
@brief Class to convert a file with fixed-width delimited contents
into a vector of string vectors.
One can define the widths of each column to use to split each line
into multiple fields.
However, no gnucash specific interpretation is done yet, that's up
to the code using this class.
*
gnc-fw-tokenizer.hpp
@author Copyright (c) 2015 Geert Janssens <geert@kobaltwit.be>
*/
#ifndef GNC_FW_TOKENIZER_HPP
#define GNC_FW_TOKENIZER_HPP
extern "C" {
#include "config.h"
}
#include <iostream>
#include <fstream> // fstream
#include <vector>
#include <string>
#include "gnc-tokenizer.hpp"
/** Tokenizer for files whose fields are laid out in fixed-width columns.
 *
 * Derives from GncTokenizer and overrides tokenize(). The column widths
 * are supplied through columns().
 */
class GncFwTokenizer : public GncTokenizer
{
public:
    /* Special member functions: all explicitly declared and left to the
     * compiler to generate. */
    GncFwTokenizer() = default;
    ~GncFwTokenizer() = default;

    GncFwTokenizer(const GncFwTokenizer&) = default;
    GncFwTokenizer(GncFwTokenizer&&) = default;

    GncFwTokenizer& operator=(const GncFwTokenizer&) = default;
    GncFwTokenizer& operator=(GncFwTokenizer&&) = default;

    /** Set the widths of the consecutive columns used for splitting.
     *  @param cols One width per column.
     */
    void columns(const std::vector<uint>& cols);

    /** Split the contents into fields; overrides the base class
     *  implementation.
     *  @return Status code as an int.
     */
    int tokenize() override;

private:
    /** Column widths as last passed to columns(). */
    std::vector<uint> col_vec;
};
#endif

View File

@ -1,5 +1,6 @@
#include "gnc-tokenizer.hpp"
#include "gnc-csv-tokenizer.hpp"
#include "gnc-fw-tokenizer.hpp"
#include <iostream>
#include <fstream> // fstream
@ -23,6 +24,9 @@ std::unique_ptr<GncTokenizer> GncTokenizerFactory(GncImpFileFormat fmt)
case GncImpFileFormat::CSV:
tok.reset(new GncCsvTokenizer());
break;
case GncImpFileFormat::FIXED_WIDTH:
tok.reset(new GncFwTokenizer());
break;
default:
break;
}

View File

@ -51,6 +51,7 @@ using str_vec = std::vector<std::string>;
/** File formats understood by the importer. GncTokenizerFactory switches
 *  on this value to decide which tokenizer class to instantiate. */
enum class GncImpFileFormat {
// No dedicated case in the factory's switch; handled by its default branch.
UNKNOWN,
// Selects GncCsvTokenizer in the factory.
CSV,
// Selects GncFwTokenizer in the factory.
FIXED_WIDTH
};