gnucash/lib/stf/stf-parse.c
Andreas Köhler a6839478a4 Merge csv-import branch back into trunk.
configure.in                                       |    2
 lib/Makefile.am                                    |    4
 lib/stf/Makefile.am                                |   13
 lib/stf/README                                     |    2
 lib/stf/stf-parse.c                                | 1414 +++++++++++++++++++++
 lib/stf/stf-parse.h                                |  112 +
 src/bin/gnucash-bin.c                              |    1
 src/import-export/Makefile.am                      |    4
 src/import-export/csv/Makefile.am                  |   59
 src/import-export/csv/example-file.csv             |    4
 src/import-export/csv/gnc-csv-gnumeric-popup.c     |  194 ++
 src/import-export/csv/gnc-csv-gnumeric-popup.h     |   78 +
 src/import-export/csv/gnc-csv-import.c             | 1173 +++++++++++++++++
 src/import-export/csv/gnc-csv-import.h             |   33
 src/import-export/csv/gnc-csv-model.c              | 1199 +++++++++++++++++
 src/import-export/csv/gnc-csv-model.h              |  122 +
 src/import-export/csv/gnc-csv-preview-dialog.glade |  496 +++++++
 src/import-export/csv/gnc-csv2glist.c              |  187 --
 src/import-export/csv/gnc-csv2glist.h              |   39
 src/import-export/csv/gnc-plugin-csv-ui.xml        |   11
 src/import-export/csv/gnc-plugin-csv.c             |  160 ++
 src/import-export/csv/gnc-plugin-csv.h             |   60
 src/import-export/csv/gncmod-csv-import.c          |   91 +
 23 files changed, 5228 insertions(+), 230 deletions(-)


git-svn-id: svn+ssh://svn.gnucash.org/repo/gnucash/trunk@16561 57a11ea4-9604-0410-9ed3-97b8803252fd
2007-10-12 22:51:34 +00:00

1415 lines
36 KiB
C

/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* stf-parse.c : Structured Text Format parser. (STF)
* A general purpose engine for parsing data
* in CSV and Fixed width format.
*
*
* Copyright (C) Almer. S. Tigelaar.
* EMail: almer1@dds.nl or almer-t@bigfoot.com
*
* Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
* Copyright (C) 2003 Morten Welinder <terra@gnome.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#define GETTEXT_PACKAGE gnumeric
#include <glib/gi18n-lib.h>
/* #include "gnumeric.h" */
#include "stf-parse.h"
/* #include "workbook.h" */
/* #include "cell.h" */
/* #include "sheet.h" */
/* #include "clipboard.h" */
/* #include "sheet-style.h" */
/* #include "value.h" */
/* #include "mstyle.h" */
/* #include "number-match.h" */
/* #include "gutils.h" */
/* #include "parse-util.h" */
#include <goffice/utils/go-glib-extras.h>
#include <goffice/utils/go-format.h>
#include <stdlib.h>
#include <stdio.h>
#include <locale.h>
#include <string.h>
#include <math.h>
/*
 * Helpers for temporarily switching the locale around a parse.
 *
 * SETUP_LOCALE_SWITCH declares the local variable `oldlocale`.
 * START_LOCALE_SWITCH, when parseoptions->locale is set, saves a copy of the
 * current LC_ALL locale in `oldlocale` and switches to parseoptions->locale.
 * END_LOCALE_SWITCH restores the saved locale (if any) and frees the copy.
 *
 * START_LOCALE_SWITCH expects a variable named `parseoptions` to be in
 * scope; START/END both expect SETUP_LOCALE_SWITCH to have been used first.
 */
#define SETUP_LOCALE_SWITCH char *oldlocale = NULL
#define START_LOCALE_SWITCH if (parseoptions->locale) {\
oldlocale = g_strdup(go_setlocale (LC_ALL, NULL)); \
go_setlocale(LC_ALL, parseoptions->locale);}
#define END_LOCALE_SWITCH if (oldlocale) {\
go_setlocale(LC_ALL, oldlocale);\
g_free (oldlocale);}
/* Source_t struct, used for interchanging parsing information between the low level parse functions */
typedef struct {
GStringChunk *chunk; /* String chunk into which parsed fixed-width fields are inserted */
char const *position; /* Indicates the current position within data */
/* Used internally for fixed width parsing */
int splitpos; /* Indicates current position in splitpositions array */
int linepos; /* Position on the current line */
} Source_t;
/*
 * Struct used for autodiscovery: one run of spaces found on a line by
 * stf_parse_options_fixed_autodiscover.
 */
typedef struct {
int start; /* Line position at which the run of spaces begins */
int stop; /* Line position of the first non-space after the run */
} AutoDiscovery_t;
/*
 * GArray keeps its element count in an unsigned field.  Hand it back as a
 * plain int so callers can mix it with signed arithmetic and comparisons.
 */
static inline int
my_garray_len (GArray const *a)
{
	guint const count = a->len;

	return (int) count;
}
/* Signed-int view of a GPtrArray's (unsigned) element count. */
static inline int
my_gptrarray_len (GPtrArray const *a)
{
	guint const count = a->len;

	return (int) count;
}
/*
 * compare_terminator:
 * @s: position in the text to test
 * @parseoptions: options holding the terminator list and the compiled
 *                range of terminator first bytes
 *
 * Returns the length in bytes of the first entry of
 * parseoptions->terminator that matches at @s, or 0 if none matches.
 */
static int
compare_terminator (char const *s, StfParseOptions_t *parseoptions)
{
	guchar const first = *(guchar const *)s;
	GSList *node;

	/* Cheap rejection: no terminator starts with a byte in this range. */
	if (first < parseoptions->compiled_terminator.min ||
	    first > parseoptions->compiled_terminator.max)
		return 0;

	for (node = parseoptions->terminator; node; node = node->next) {
		char const *pattern = node->data;
		char const *text = s;

		/* Walk both strings while they agree. */
		while (*pattern && *text == *pattern) {
			pattern++;
			text++;
		}

		/* The whole terminator was consumed: it matches. */
		if (*pattern == 0)
			return text - s;
	}

	return 0;
}
/*******************************************************************************************************
* STF PARSE OPTIONS : StfParseOptions related
*******************************************************************************************************/
/**
* stf_parse_options_new:
*
* This will return a new StfParseOptions_t struct.
* The struct should, after being used, freed with stf_parse_options_free.
**/
StfParseOptions_t *
stf_parse_options_new (void)
{
StfParseOptions_t* parseoptions = g_new0 (StfParseOptions_t, 1);
parseoptions->parsetype = PARSE_TYPE_NOTSET;
parseoptions->terminator = NULL;
stf_parse_options_add_line_terminator (parseoptions, "\r\n");
stf_parse_options_add_line_terminator (parseoptions, "\n");
stf_parse_options_add_line_terminator (parseoptions, "\r");
parseoptions->trim_spaces = (TRIM_TYPE_RIGHT | TRIM_TYPE_LEFT);
parseoptions->locale = NULL;
parseoptions->splitpositions = NULL;
stf_parse_options_fixed_splitpositions_clear (parseoptions);
parseoptions->stringindicator = '"';
parseoptions->indicator_2x_is_single = TRUE;
parseoptions->duplicates = FALSE;
parseoptions->trim_seps = FALSE;
parseoptions->sep.str = NULL;
parseoptions->sep.chr = NULL;
parseoptions->col_import_array = NULL;
parseoptions->col_import_array_len = 0;
parseoptions->formats = NULL;
parseoptions->cols_exceeded = FALSE;
return parseoptions;
}
/**
 * stf_parse_options_free:
 * @parseoptions: the options struct to destroy
 *
 * Frees @parseoptions together with everything it owns: the column import
 * array, the locale string, the character and string separators, the
 * splitpositions array, the line terminators and the formats array (each
 * format is unreferenced).
 **/
void
stf_parse_options_free (StfParseOptions_t *parseoptions)
{
	g_return_if_fail (parseoptions != NULL);

	g_free (parseoptions->col_import_array);
	g_free (parseoptions->locale);
	g_free (parseoptions->sep.chr);

	/* Frees each separator string and then the list itself. */
	go_slist_free_custom (parseoptions->sep.str, g_free);
	parseoptions->sep.str = NULL;

	/* g_array_free complains about a NULL array, so only call it when set. */
	if (parseoptions->splitpositions) {
		g_array_free (parseoptions->splitpositions, TRUE);
		parseoptions->splitpositions = NULL;
	}

	stf_parse_options_clear_line_terminator (parseoptions);

	if (parseoptions->formats) {
		unsigned int ui;
		GPtrArray *formats = parseoptions->formats;

		for (ui = 0; ui < formats->len; ui++)
			go_format_unref (g_ptr_array_index (formats, ui));
		g_ptr_array_free (formats, TRUE);
		parseoptions->formats = NULL;
	}

	g_free (parseoptions);
}
/**
 * stf_parse_options_set_type:
 * @parseoptions: the options struct to modify
 * @parsetype: PARSE_TYPE_CSV or PARSE_TYPE_FIXED
 *
 * Selects the kind of parsing to perform.  Any other value of @parsetype
 * is rejected and leaves @parseoptions unchanged.
 **/
void
stf_parse_options_set_type (StfParseOptions_t *parseoptions, StfParseType_t const parsetype)
{
	g_return_if_fail (parseoptions != NULL);
	g_return_if_fail (parsetype == PARSE_TYPE_CSV || parsetype == PARSE_TYPE_FIXED);

	parseoptions->parsetype = parsetype;
}
/*
 * long_string_first:
 *
 * Comparison function that orders longer strings before shorter ones.
 * Returns a negative value if @a is longer than @b, a positive value if
 * it is shorter and 0 if both have the same byte length.
 *
 * The lengths are compared explicitly rather than subtracted: strlen
 * returns an unsigned size_t, so a difference would wrap around before
 * being narrowed to gint.  Byte lengths are fine for UTF-8 here.
 */
static gint
long_string_first (gchar const *a, gchar const *b)
{
	size_t const len_a = strlen (a);
	size_t const len_b = strlen (b);

	if (len_a > len_b)
		return -1;
	if (len_a < len_b)
		return 1;
	return 0;
}
/*
 * compile_terminators:
 *
 * Sorts the terminator list so that longer terminators come first and
 * records the smallest and largest first byte of any terminator in
 * parseoptions->compiled_terminator.  With an empty list min is left at
 * 255 and max at 0.
 */
static void
compile_terminators (StfParseOptions_t *parseoptions)
{
	GSList *node;

	GO_SLIST_SORT (parseoptions->terminator, (GCompareFunc)long_string_first);

	parseoptions->compiled_terminator.min = 255;
	parseoptions->compiled_terminator.max = 0;

	for (node = parseoptions->terminator; node; node = node->next) {
		guchar const first = *(guchar const *) node->data;

		if (first < parseoptions->compiled_terminator.min)
			parseoptions->compiled_terminator.min = first;
		if (first > parseoptions->compiled_terminator.max)
			parseoptions->compiled_terminator.max = first;
	}
}
/**
 * stf_parse_options_add_line_terminator:
 * @parseoptions: the options struct to modify
 * @terminator: a non-empty string marking the end of a row
 *
 * Adds a copy of @terminator to the set of line terminators and
 * recompiles the terminator lookup data.  Line terminators mark the end
 * of a row in both the fixed-width and the CSV importers.
 **/
void
stf_parse_options_add_line_terminator (StfParseOptions_t *parseoptions, char const *terminator)
{
	char *copy;

	g_return_if_fail (parseoptions != NULL);
	g_return_if_fail (terminator != NULL && *terminator != 0);

	copy = g_strdup (terminator);
	GO_SLIST_PREPEND (parseoptions->terminator, copy);
	compile_terminators (parseoptions);
}
/**
 * stf_parse_options_clear_line_terminator:
 * @parseoptions: the options struct to modify
 *
 * Removes and frees all line terminators, then recompiles the (now
 * empty) terminator lookup data.
 **/
void
stf_parse_options_clear_line_terminator (StfParseOptions_t *parseoptions)
{
	GSList *old;

	g_return_if_fail (parseoptions != NULL);

	old = parseoptions->terminator;
	go_slist_free_custom (old, g_free);
	parseoptions->terminator = NULL;

	compile_terminators (parseoptions);
}
/**
 * stf_parse_options_set_trim_spaces:
 * @parseoptions: the options struct to modify
 * @trim_spaces: combination of TRIM_TYPE_LEFT and TRIM_TYPE_RIGHT
 *
 * Chooses on which sides of every parsed field spaces are trimmed.
 **/
void
stf_parse_options_set_trim_spaces (StfParseOptions_t *parseoptions, StfTrimType_t const trim_spaces)
{
	g_return_if_fail (parseoptions != NULL);

	parseoptions->trim_spaces = trim_spaces;
}
/**
 * stf_parse_options_csv_set_separators:
 * @parseoptions: the options struct to modify
 * @character: string whose characters each act as a separator, or NULL
 * @string: list of separator strings, or NULL
 *
 * Replaces the current CSV separators.  Copies are made of both
 * parameters; the previous separators are freed.
 **/
void
stf_parse_options_csv_set_separators (StfParseOptions_t *parseoptions, char const *character,
				      GSList const *string)
{
	g_return_if_fail (parseoptions != NULL);

	/* Single-character separators. */
	g_free (parseoptions->sep.chr);
	parseoptions->sep.chr = g_strdup (character);

	/* Multi-character separators. */
	go_slist_free_custom (parseoptions->sep.str, g_free);
	parseoptions->sep.str = go_slist_map (string, (GOMapFunc)g_strdup);
}
/**
 * stf_parse_options_csv_set_stringindicator:
 * @parseoptions: the options struct to modify
 * @stringindicator: the character that quotes a field; must not be '\0'
 *
 * Sets the character used to quote fields in CSV data.
 **/
void
stf_parse_options_csv_set_stringindicator (StfParseOptions_t *parseoptions, gunichar const stringindicator)
{
	g_return_if_fail (parseoptions != NULL);
	g_return_if_fail (stringindicator != '\0');

	parseoptions->stringindicator = stringindicator;
}
/**
 * stf_parse_options_csv_set_indicator_2x_is_single:
 * @parseoptions: the options struct to modify
 * @indic_2x: whether two adjacent string indicators inside a quoted field
 *            stand for one literal indicator character that is part of
 *            the cell, rather than ending the field
 **/
void
stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t *parseoptions,
						  gboolean const indic_2x)
{
	g_return_if_fail (parseoptions != NULL);

	parseoptions->indicator_2x_is_single = indic_2x;
}
/**
 * stf_parse_options_csv_set_duplicates:
 * @parseoptions: the options struct to modify
 * @duplicates: whether separators that directly follow each other are
 *              treated as a single separator
 **/
void
stf_parse_options_csv_set_duplicates (StfParseOptions_t *parseoptions, gboolean const duplicates)
{
	g_return_if_fail (parseoptions != NULL);

	parseoptions->duplicates = duplicates;
}
/**
 * stf_parse_options_csv_set_trim_seps:
 * @parseoptions: the options struct to modify
 * @trim_seps: whether separators at the beginning of a line are skipped
 **/
void
stf_parse_options_csv_set_trim_seps (StfParseOptions_t *parseoptions, gboolean const trim_seps)
{
	g_return_if_fail (parseoptions != NULL);

	parseoptions->trim_seps = trim_seps;
}
/**
 * stf_parse_options_fixed_splitpositions_clear:
 * @parseoptions: the options struct to modify
 *
 * Discards all split positions (the points at which a line is split) and
 * replaces them with a fresh array holding only the value -1.
 **/
void
stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t *parseoptions)
{
	int sentinel = -1;
	GArray *fresh;

	g_return_if_fail (parseoptions != NULL);

	if (parseoptions->splitpositions)
		g_array_free (parseoptions->splitpositions, TRUE);

	fresh = g_array_new (FALSE, FALSE, sizeof (int));
	parseoptions->splitpositions = fresh;
	g_array_append_val (parseoptions->splitpositions, sentinel);
}
/**
 * stf_parse_options_fixed_splitpositions_add:
 * @parseoptions: the options struct to modify
 * @position: non-negative split position to add
 *
 * Inserts @position into the split positions, keeping them in ascending
 * order.  Nothing happens if @position is already present.  The final
 * array element is never examined, so the insertion always happens in
 * front of it.
 **/
void
stf_parse_options_fixed_splitpositions_add (StfParseOptions_t *parseoptions, int position)
{
	unsigned int idx = 0;

	g_return_if_fail (parseoptions != NULL);
	g_return_if_fail (position >= 0);

	/* Find the first entry that is not smaller than @position. */
	while (idx < parseoptions->splitpositions->len - 1) {
		int const here = g_array_index (parseoptions->splitpositions, int, idx);

		if (here == position)
			return;
		if (here > position)
			break;
		idx++;
	}

	g_array_insert_val (parseoptions->splitpositions, idx, position);
}
/**
 * stf_parse_options_fixed_splitpositions_remove:
 * @parseoptions: the options struct to modify
 * @position: non-negative split position to remove
 *
 * Removes @position from the split positions if it is present.  The
 * final array element is never examined.
 **/
void
stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t *parseoptions, int position)
{
	unsigned int idx;

	g_return_if_fail (parseoptions != NULL);
	g_return_if_fail (position >= 0);

	for (idx = 0; idx < parseoptions->splitpositions->len - 1; idx++) {
		int const here = g_array_index (parseoptions->splitpositions, int, idx);

		/* Entries are ascending: keep looking while they are smaller. */
		if (here < position)
			continue;

		if (here == position)
			g_array_remove_index (parseoptions->splitpositions, idx);
		return;
	}
}
/**
 * stf_parse_options_fixed_splitpositions_count:
 * @parseoptions: the options struct to query
 *
 * Returns the number of entries in the splitpositions array, or 0 if
 * @parseoptions or its splitpositions array is NULL.
 **/
int
stf_parse_options_fixed_splitpositions_count (StfParseOptions_t *parseoptions)
{
	g_return_val_if_fail (parseoptions != NULL, 0);
	g_return_val_if_fail (parseoptions->splitpositions != NULL, 0);

	return parseoptions->splitpositions->len;
}
/**
 * stf_parse_options_fixed_splitpositions_nth:
 * @parseoptions: the options struct to query
 * @n: zero-based index into the splitpositions array
 *
 * Returns the @n-th entry of the splitpositions array.  If @parseoptions
 * or its array is NULL, or @n is out of range, a warning is logged and
 * -1 is returned instead of reading outside the array.
 **/
int
stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t *parseoptions, int n)
{
	g_return_val_if_fail (parseoptions != NULL, -1);
	g_return_val_if_fail (parseoptions->splitpositions != NULL, -1);
	g_return_val_if_fail (n >= 0 && (guint) n < parseoptions->splitpositions->len, -1);

	return g_array_index (parseoptions->splitpositions, int, n);
}
/**
 * stf_parse_options_valid:
 * @parseoptions : an import options struct
 *
 * Checks whether @parseoptions is usable for its parse type: CSV needs a
 * string indicator other than '\0', fixed-width needs a splitpositions
 * array.  A warning is logged when a check fails.
 *
 * returns : TRUE if it is correctly filled, FALSE otherwise.
 **/
static gboolean
stf_parse_options_valid (StfParseOptions_t *parseoptions)
{
	g_return_val_if_fail (parseoptions != NULL, FALSE);

	switch (parseoptions->parsetype) {
	case PARSE_TYPE_CSV:
		if (parseoptions->stringindicator == '\0') {
			g_warning ("STF: Cannot have \\0 as string indicator");
			return FALSE;
		}
		break;

	case PARSE_TYPE_FIXED:
		if (!parseoptions->splitpositions) {
			g_warning ("STF: No splitpositions in struct");
			return FALSE;
		}
		break;

	default:
		break;
	}

	return TRUE;
}
/*******************************************************************************************************
* STF PARSE : The actual routines that do the 'trick'
*******************************************************************************************************/
/*
 * trim_spaces_inplace:
 * @field: NUL-terminated UTF-8 string to modify, may be NULL
 * @parseoptions: options whose trim_spaces flags select the sides to trim
 *
 * Removes leading and/or trailing Unicode whitespace from @field in
 * place.  A NULL @field is ignored.
 */
static void
trim_spaces_inplace (char *field, StfParseOptions_t const *parseoptions)
{
	if (!field)
		return;

	if (parseoptions->trim_spaces & TRIM_TYPE_LEFT) {
		char *s = field;

		while (g_unichar_isspace (g_utf8_get_char (s)))
			s = g_utf8_next_char (s);

		/*
		 * Source and destination overlap, which strcpy does not
		 * allow; memmove does.  The +1 copies the terminating NUL.
		 */
		if (s != field)
			memmove (field, s, strlen (s) + 1);
	}

	if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
		char *s = field + strlen (field);

		/* Walk backwards, cutting the string at each trailing space. */
		while (field != s) {
			s = g_utf8_prev_char (s);
			if (!g_unichar_isspace (g_utf8_get_char (s)))
				break;
			*s = 0;
		}
	}
}
/**
 * stf_parse_csv_is_separator:
 * @character: position in the text to test
 * @chr: string whose characters each act as a separator, or NULL
 * @str: list of separator strings, or NULL
 *
 * The string separators in @str are tried first, in list order, followed
 * by the single characters in @chr.
 *
 * returns NULL if @character is not a separator, a pointer to the character
 * after the separator otherwise.
 **/
static char const *
stf_parse_csv_is_separator (char const *character, char const *chr, GSList const *str)
{
	GSList const *l;

	g_return_val_if_fail (character != NULL, NULL);

	if (*character == 0)
		return NULL;

	for (l = str; l != NULL; l = l->next) {
		char const *s = l->data;
		size_t nbytes;

		/*
		 * An empty separator would "match" without consuming any
		 * input, so callers that loop over separators would never
		 * make progress.  Skip such entries.
		 */
		if (s == NULL || *s == 0)
			continue;

		/*
		 * Compare by byte length: a character count is too short
		 * for separators containing multi-byte UTF-8 sequences.
		 * strncmp stops at the text's terminating NUL, so this does
		 * not read past the end of the buffer.
		 */
		nbytes = strlen (s);
		if (strncmp (character, s, nbytes) == 0)
			return character + nbytes;
	}

	if (chr && g_utf8_strchr (chr, -1,
				  g_utf8_get_char (character)))
		return g_utf8_next_char (character);

	return NULL;
}
/*
 * stf_parse_eat_separators:
 * @src: parse state; src->position is advanced past the separators
 * @parseoptions: options holding the separators and terminators
 *
 * Skips over any run of separators at the current position.  Nothing is
 * skipped when the position is at the end of the data or at a line
 * terminator.
 */
static void
stf_parse_eat_separators (Source_t *src, StfParseOptions_t *parseoptions)
{
	char const *here, *after;

	g_return_if_fail (src != NULL);
	g_return_if_fail (parseoptions != NULL);

	here = src->position;
	if (*here != '\0' && !compare_terminator (here, parseoptions)) {
		for (;;) {
			after = stf_parse_csv_is_separator (here, parseoptions->sep.chr,
							    parseoptions->sep.str);
			if (!after)
				break;
			here = after;
		}
		src->position = here;
	}
}
/* Outcome of parsing one CSV cell, as returned by stf_parse_csv_cell. */
typedef enum {
STF_CELL_ERROR, /* An argument check failed */
STF_CELL_EOF, /* End of data reached before a field started */
STF_CELL_EOL, /* Line terminator found (and consumed) before a field started */
STF_CELL_FIELD_NO_SEP, /* A field was parsed; no separator followed it */
STF_CELL_FIELD_SEP, /* A field was parsed and the separator after it consumed */
} StfParseCellRes;
/*
 * stf_parse_csv_cell:
 * @text: string to which the contents of the parsed field are appended
 * @src: parse state; src->position is read and advanced
 * @parseoptions: separators, terminators, string indicator and trim flags
 *
 * Parses a single CSV field starting at src->position.
 *
 * Returns STF_CELL_ERROR if an argument check fails, STF_CELL_EOF if the
 * end of the data is reached before a field starts, STF_CELL_EOL if a line
 * terminator is found before a field starts (the terminator is consumed),
 * STF_CELL_FIELD_SEP if the field was followed by a separator (which is
 * consumed) and STF_CELL_FIELD_NO_SEP otherwise.
 */
static StfParseCellRes
stf_parse_csv_cell (GString *text, Source_t *src, StfParseOptions_t *parseoptions)
{
char const *cur;
gboolean saw_sep = FALSE;
g_return_val_if_fail (src != NULL, STF_CELL_ERROR);
g_return_val_if_fail (parseoptions != NULL, STF_CELL_ERROR);
cur = src->position;
g_return_val_if_fail (cur != NULL, STF_CELL_ERROR);
/* Skip whitespace, but stop at line terminators. */
while (1) {
int term_len;
if (*cur == 0) {
src->position = cur;
return STF_CELL_EOF;
}
term_len = compare_terminator (cur, parseoptions);
if (term_len) {
src->position = cur + term_len;
return STF_CELL_EOL;
}
/* Leading whitespace is only skipped when left-trimming is on. */
if ((parseoptions->trim_spaces & TRIM_TYPE_LEFT) == 0)
break;
/* A separator is never skipped here, even if it is a whitespace character. */
if (stf_parse_csv_is_separator (cur, parseoptions->sep.chr,
parseoptions->sep.str))
break;
if (!g_unichar_isspace (g_utf8_get_char (cur)))
break;
cur = g_utf8_next_char (cur);
}
if (g_utf8_get_char (cur) == parseoptions->stringindicator) {
/* Quoted field: step over the opening string indicator. */
cur = g_utf8_next_char (cur);
while (*cur) {
gunichar uc = g_utf8_get_char (cur);
cur = g_utf8_next_char (cur);
if (uc == parseoptions->stringindicator) {
/* A doubled indicator stands for one literal indicator character. */
if (parseoptions->indicator_2x_is_single &&
g_utf8_get_char (cur) == parseoptions->stringindicator)
cur = g_utf8_next_char (cur);
else {
/* "field content"dropped-garbage, */
while (*cur && !compare_terminator (cur, parseoptions)) {
char const *post = stf_parse_csv_is_separator
(cur, parseoptions->sep.chr, parseoptions->sep.str);
if (post) {
cur = post;
saw_sep = TRUE;
break;
}
cur = g_utf8_next_char (cur);
}
break;
}
}
g_string_append_unichar (text, uc);
}
/* We silently allow a missing terminating quote. */
} else {
/* Unquoted field. */
while (*cur && !compare_terminator (cur, parseoptions)) {
char const *post = stf_parse_csv_is_separator
(cur, parseoptions->sep.chr, parseoptions->sep.str);
if (post) {
cur = post;
saw_sep = TRUE;
break;
}
g_string_append_unichar (text, g_utf8_get_char (cur));
cur = g_utf8_next_char (cur);
}
/* Strip trailing whitespace from the collected text, one character at a time. */
if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
while (text->len) {
const char *last = g_utf8_prev_char (text->str + text->len);
if (!g_unichar_isspace (g_utf8_get_char (last)))
break;
g_string_truncate (text, last - text->str);
}
}
}
src->position = cur;
/* With the duplicates option, further separators directly after this one are skipped. */
if (saw_sep && parseoptions->duplicates)
stf_parse_eat_separators (src, parseoptions);
return saw_sep ? STF_CELL_FIELD_SEP : STF_CELL_FIELD_NO_SEP;
}
/**
 * stf_parse_csv_line:
 * @src: parse state; src->position is advanced past the parsed line
 * @parseoptions: CSV parsing options
 *
 * This will parse one line from the current @src->position.
 *
 * The field strings are stored in @src->chunk, in the same way as the
 * fixed-width parser stores its fields, so they live as long as that
 * chunk and must not be freed individually.  This matches
 * stf_parse_general_free, which releases only the arrays.
 *
 * NOTE: The calling routine is responsible for freeing the returned
 * array.
 *
 * returns : a GPtrArray of char*'s
 **/
static GPtrArray *
stf_parse_csv_line (Source_t *src, StfParseOptions_t *parseoptions)
{
	GPtrArray *line;
	GString *text;
	gboolean cont = FALSE;

	g_return_val_if_fail (src != NULL, NULL);
	g_return_val_if_fail (parseoptions != NULL, NULL);

	line = g_ptr_array_new ();
	if (parseoptions->trim_seps)
		stf_parse_eat_separators (src, parseoptions);

	/* One scratch buffer is reused for every cell of the line. */
	text = g_string_sized_new (30);

	while (1) {
		StfParseCellRes res;

		g_string_truncate (text, 0);
		res = stf_parse_csv_cell (text, src, parseoptions);

		/*
		 * Trimming edits text->str directly without updating
		 * text->len, so the field is copied below as a
		 * NUL-terminated string rather than by length.
		 */
		trim_spaces_inplace (text->str, parseoptions);

		if (res == STF_CELL_FIELD_NO_SEP || res == STF_CELL_FIELD_SEP) {
			g_ptr_array_add (line,
					 g_string_chunk_insert (src->chunk, text->str));
			/* After a separator we must see one more field. */
			cont = (res == STF_CELL_FIELD_SEP);
			continue;
		}

		/* End of line/data: a trailing separator yields a final empty field. */
		if (cont)
			g_ptr_array_add (line,
					 g_string_chunk_insert (src->chunk, text->str));
		break;
	}

	g_string_free (text, TRUE);
	return line;
}
/**
 * stf_parse_fixed_cell:
 * @src: parse state; position and linepos are advanced
 * @parseoptions: options holding the split positions and terminators
 *
 * Reads characters from @src->position until the end of the data, a line
 * terminator, or the line position reaches the split position selected
 * by @src->splitpos.  The text is copied into @src->chunk.
 *
 * returns a pointer to the parsed cell contents.
 **/
static char *
stf_parse_fixed_cell (Source_t *src, StfParseOptions_t *parseoptions)
{
	char const *end;
	char *cell;
	int split = -1;

	g_return_val_if_fail (src != NULL, NULL);
	g_return_val_if_fail (parseoptions != NULL, NULL);

	/* -1 (no more split positions) never equals a line position. */
	if (src->splitpos < my_garray_len (parseoptions->splitpositions))
		split = (int) g_array_index (parseoptions->splitpositions, int, src->splitpos);

	for (end = src->position;
	     *end != 0 && !compare_terminator (end, parseoptions) && split != src->linepos;
	     end = g_utf8_next_char (end))
		src->linepos++;

	cell = g_string_chunk_insert_len (src->chunk,
					  src->position,
					  end - src->position);
	src->position = end;

	return cell;
}
/**
 * stf_parse_fixed_line:
 * @src: parse state; advanced up to (not past) the line terminator
 * @parseoptions: options holding the split positions and terminators
 *
 * This will parse one line from the current @src->position and return a
 * GPtrArray with the (trimmed) cell contents as strings.
 * NOTE: The calling routine is responsible for freeing the result.
 **/
static GPtrArray *
stf_parse_fixed_line (Source_t *src, StfParseOptions_t *parseoptions)
{
	GPtrArray *cells;

	g_return_val_if_fail (src != NULL, NULL);
	g_return_val_if_fail (parseoptions != NULL, NULL);

	/* Every line starts at column 0 with the first split position. */
	src->linepos = 0;
	src->splitpos = 0;

	cells = g_ptr_array_new ();
	for (;;) {
		char *field;

		if (*src->position == '\0' || compare_terminator (src->position, parseoptions))
			break;

		field = stf_parse_fixed_cell (src, parseoptions);
		trim_spaces_inplace (field, parseoptions);
		g_ptr_array_add (cells, field);
		src->splitpos++;
	}

	return cells;
}
/**
 * stf_parse_general_free:
 * @lines: array of lines, each itself a GPtrArray of fields
 *
 * Frees every line array and then @lines itself.  The field strings are
 * not freed here.
 **/
void
stf_parse_general_free (GPtrArray *lines)
{
	unsigned idx = 0;

	while (idx < lines->len) {
		GPtrArray *row = g_ptr_array_index (lines, idx);

		/* Only the array is released, not the fields it points to. */
		g_ptr_array_free (row, TRUE);
		idx++;
	}

	g_ptr_array_free (lines, TRUE);
}
/**
 * stf_parse_general:
 * @parseoptions: validated parse options (CSV or fixed-width)
 * @lines_chunk: string chunk handed to the line parsers
 * @data: NUL-terminated, valid UTF-8 text to parse
 * @data_end: parsing stops once the position reaches this pointer
 *
 * Parses @data line by line, stopping at the end of the data or after
 * SHEET_MAX_ROWS lines.
 *
 * Returns a GPtrArray of lines, where each line is itself a
 * GPtrArray of strings, or NULL if an argument check fails.
 *
 * The caller must free this entire structure, for example by calling
 * stf_parse_general_free.
 **/
GPtrArray *
stf_parse_general (StfParseOptions_t *parseoptions,
		   GStringChunk *lines_chunk,
		   char const *data, char const *data_end)
{
	GPtrArray *result;
	Source_t src;
	int row = 0;

	g_return_val_if_fail (parseoptions != NULL, NULL);
	g_return_val_if_fail (data != NULL, NULL);
	g_return_val_if_fail (data_end != NULL, NULL);
	g_return_val_if_fail (stf_parse_options_valid (parseoptions), NULL);
	g_return_val_if_fail (g_utf8_validate (data, -1, NULL), NULL);

	src.chunk = lines_chunk;
	src.position = data;

	result = g_ptr_array_new ();
	while (*src.position != '\0' && src.position < data_end) {
		GPtrArray *line;

		if (parseoptions->parsetype == PARSE_TYPE_CSV)
			line = stf_parse_csv_line (&src, parseoptions);
		else
			line = stf_parse_fixed_line (&src, parseoptions);
		g_ptr_array_add (result, line);

		/* The CSV parser consumes the terminator itself. */
		if (parseoptions->parsetype != PARSE_TYPE_CSV)
			src.position += compare_terminator (src.position, parseoptions);

		row++;
		if (row == SHEET_MAX_ROWS)
			break;
	}

	return result;
}
/**
 * stf_parse_lines:
 * @parseoptions: options holding the line terminators
 * @lines_chunk: string chunk that receives the line strings
 * @data: NUL-terminated text to split
 * @maxlines: limit on the number of lines to read
 * @with_lineno: whether each line is prefixed with its 1-based number
 *
 * Splits @data into raw lines without any field parsing.
 *
 * Returns a GPtrArray of lines; each line is a GPtrArray holding the
 * optional line number string followed by the line text (without its
 * terminator).  Returns NULL if @data is NULL.
 *
 * NOTE(review): the line counter starts at 1 and the loop stops once it
 * reaches @maxlines, so at most @maxlines - 1 lines appear to be read
 * (but always at least one when @data is non-empty) -- confirm whether
 * that is intended.
 **/
GPtrArray *
stf_parse_lines (StfParseOptions_t *parseoptions,
		 GStringChunk *lines_chunk,
		 char const *data,
		 int maxlines, gboolean with_lineno)
{
	GPtrArray *result;
	int lineno;

	g_return_val_if_fail (data != NULL, NULL);

	result = g_ptr_array_new ();
	for (lineno = 1; *data; ) {
		char const *start = data;
		GPtrArray *row = g_ptr_array_new ();
		int termlen;

		if (with_lineno) {
			char buf[4 * sizeof (int)];

			sprintf (buf, "%d", lineno);
			g_ptr_array_add (row,
					 g_string_chunk_insert (lines_chunk, buf));
		}

		/* Advance to the next terminator or the end of the data. */
		while ((termlen = compare_terminator (data, parseoptions)) <= 0 &&
		       *data != 0)
			data = g_utf8_next_char (data);

		g_ptr_array_add (row,
				 g_string_chunk_insert_len (lines_chunk,
							    start,
							    data - start));
		data += termlen;

		g_ptr_array_add (result, row);

		if (++lineno >= maxlines)
			break;
	}

	return result;
}
/**
 * stf_parse_find_line:
 * @parseoptions: options holding the line terminators
 * @data: NUL-terminated text to search
 * @line: number of line terminators to skip
 *
 * Returns a pointer to the position just after the @line-th line
 * terminator in @data, or to the terminating NUL if the data ends first.
 * @data itself is returned when @line is not positive.
 **/
char const *
stf_parse_find_line (StfParseOptions_t *parseoptions,
		     char const *data,
		     int line)
{
	int remaining = line;

	while (remaining > 0) {
		int const skip = compare_terminator (data, parseoptions);

		if (skip > 0) {
			data += skip;
			remaining--;
			continue;
		}

		if (*data == 0)
			break;

		data = g_utf8_next_char (data);
	}

	return data;
}
/**
 * stf_parse_options_fixed_autodiscover:
 * @parseoptions: a Parse options struct.
 * @data : The actual data.
 * @data_end : Scanning of @data stops once this pointer is reached.
 *
 * Automatically try to discover columns in the text to be parsed.
 * The existing split positions of @parseoptions are cleared and replaced
 * by the discovered ones.
 * We ignore empty lines (only containing parseoptions->terminator)
 *
 * The function works in three passes over the data:
 * 1. record every run of spaces on every line and turn the positions at
 *    which a run starts or stops on all non-empty lines into split
 *    positions;
 * 2. split columns that look left- and right-justified with at least two
 *    spaces in between;
 * 3. remove columns that contain only spaces.
 *
 * NOTE(review): positions are counted per byte here (iterator++), whereas
 * stf_parse_fixed_cell advances per UTF-8 character -- the two may
 * disagree for non-ASCII data; TODO confirm.
 *
 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
 * Think hard of a better more flexible solution...
 **/
void
stf_parse_options_fixed_autodiscover (StfParseOptions_t *parseoptions,
char const *data, char const *data_end)
{
char const *iterator = data;
GSList *list = NULL;
GSList *list_start = NULL;
int lines = 0;
int effective_lines = 0;
int max_line_length = 0;
int *line_begin_hits = NULL;
int *line_end_hits = NULL;
int i;
stf_parse_options_fixed_splitpositions_clear (parseoptions);
/*
 * First take a look at all possible white space combinations
 */
while (*iterator && iterator < data_end) {
gboolean begin_recorded = FALSE;
AutoDiscovery_t *disc = NULL;
int position = 0;
int termlen = 0;
while (*iterator && (termlen = compare_terminator (iterator, parseoptions)) == 0) {
if (!begin_recorded && *iterator == ' ') {
/* A run of spaces starts here. */
disc = g_new0 (AutoDiscovery_t, 1);
disc->start = position;
begin_recorded = TRUE;
} else if (begin_recorded && *iterator != ' ') {
/* The run ends at the first non-space; keep it. */
disc->stop = position;
list = g_slist_prepend (list, disc);
begin_recorded = FALSE;
disc = NULL;
}
position++;
iterator++;
}
if (position > max_line_length)
max_line_length = position;
/*
 * If there are excess spaces at the end of
 * the line : ignore them
 */
g_free (disc);
/*
 * Hop over the terminator
 */
iterator += termlen;
/* Only non-empty lines count towards the hit threshold. */
if (position != 0)
effective_lines++;
lines++;
}
list = g_slist_reverse (list);
list_start = list;
/*
 * Kewl stuff :
 * Look at the number of hits at each line position
 * if the number of hits equals the number of lines
 * we can be pretty sure this is the start or end
 * of a column, we filter out empty columns
 * later
 */
line_begin_hits = g_new0 (int, max_line_length + 1);
line_end_hits = g_new0 (int, max_line_length + 1);
while (list) {
AutoDiscovery_t *disc = list->data;
line_begin_hits[disc->start]++;
line_end_hits[disc->stop]++;
g_free (disc);
list = g_slist_next (list);
}
g_slist_free (list_start);
for (i = 0; i < max_line_length + 1; i++)
if (line_begin_hits[i] == effective_lines || line_end_hits[i] == effective_lines)
stf_parse_options_fixed_splitpositions_add (parseoptions, i);
/*
 * Do some corrections to the initial columns
 * detected here, we obviously don't need to
 * do this if there are no columns at all.
 */
if (my_garray_len (parseoptions->splitpositions) > 0) {
/*
 * Try to find columns that look like :
 *
 * Example 100
 * Example2 9
 *
 * (In other words : Columns with left & right justification with
 * a minimum of 2 spaces in the middle)
 * Split these columns in 2
 */
for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
int begin = g_array_index (parseoptions->splitpositions, int, i);
int end = g_array_index (parseoptions->splitpositions, int, i + 1);
int num_spaces = -1;
int spaces_start = 0;
gboolean right_aligned = TRUE;
gboolean left_aligned = TRUE;
gboolean has_2_spaces = TRUE;
iterator = data;
lines = 0;
while (*iterator && iterator < data_end) {
gboolean trigger = FALSE;
gboolean space_trigger = FALSE;
int pos = 0;
num_spaces = -1;
spaces_start = 0;
while (*iterator && !compare_terminator (iterator, parseoptions)) {
if (pos == begin) {
if (*iterator == ' ')
left_aligned = FALSE;
trigger = TRUE;
} else if (pos == end - 1) {
if (*iterator == ' ')
right_aligned = FALSE;
trigger = FALSE;
}
/* Inside the column: measure the last run of spaces. */
if (trigger || pos == end - 1) {
if (!space_trigger && *iterator == ' ') {
space_trigger = TRUE;
spaces_start = pos;
} else if (space_trigger && *iterator != ' ') {
space_trigger = FALSE;
num_spaces = pos - spaces_start;
}
}
iterator++;
pos++;
}
if (num_spaces < 2)
has_2_spaces = FALSE;
/*
 * NOTE(review): this steps over a single byte, whereas the first
 * pass above advances by the matched terminator length; with a
 * multi-byte terminator the remaining bytes may be seen as an
 * extra (empty) line -- TODO confirm.
 */
if (*iterator)
iterator++;
lines++;
}
/*
 * If this column meets all the criteria
 * split it into two at the last measured
 * spaces_start + num_spaces
 */
if (has_2_spaces && right_aligned && left_aligned) {
int val = (((spaces_start + num_spaces) - spaces_start) / 2) + spaces_start;
g_array_insert_val (parseoptions->splitpositions, i + 1, val);
/*
 * Skip over the inserted column
 */
i++;
}
}
/*
 * Remove empty columns here if needed
 */
for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
int begin = g_array_index (parseoptions->splitpositions, int, i);
int end = g_array_index (parseoptions->splitpositions, int, i + 1);
gboolean only_spaces = TRUE;
iterator = data;
lines = 0;
while (*iterator && iterator < data_end) {
gboolean trigger = FALSE;
int pos = 0;
while (*iterator && !compare_terminator (iterator, parseoptions)) {
/* trigger is TRUE while pos lies within [begin, end). */
if (pos == begin)
trigger = TRUE;
else if (pos == end)
trigger = FALSE;
if (trigger) {
if (*iterator != ' ')
only_spaces = FALSE;
}
iterator++;
pos++;
}
/* NOTE(review): single-byte step over the terminator, as in the pass above. */
if (*iterator)
iterator++;
lines++;
}
/*
 * The column only contains spaces
 * remove it
 */
if (only_spaces) {
g_array_remove_index (parseoptions->splitpositions, i);
/*
 * We HAVE to make sure that the next column (end) also
 * gets checked out. If we don't decrease "i" here, we
 * will skip over it as the indexes shift down after
 * the removal
 */
i--;
}
}
}
g_free (line_begin_hits);
g_free (line_end_hits);
}
/*******************************************************************************************************
* STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
* functions into something meaningful (== application specific)
*******************************************************************************************************/
/* gboolean */
/* stf_parse_sheet (StfParseOptions_t *parseoptions, */
/* char const *data, char const *data_end, */
/* Sheet *sheet, int start_col, int start_row) */
/* { */
/* int row, col; */
/* unsigned int lrow, lcol; */
/* GODateConventions const *date_conv; */
/* GStringChunk *lines_chunk; */
/* GPtrArray *lines, *line; */
/* SETUP_LOCALE_SWITCH; */
/* g_return_val_if_fail (parseoptions != NULL, FALSE); */
/* g_return_val_if_fail (data != NULL, FALSE); */
/* g_return_val_if_fail (IS_SHEET (sheet), FALSE); */
/* START_LOCALE_SWITCH; */
/* date_conv = workbook_date_conv (sheet->workbook); */
/* if (!data_end) */
/* data_end = data + strlen (data); */
/* lines_chunk = g_string_chunk_new (100 * 1024); */
/* lines = stf_parse_general (parseoptions, lines_chunk, data, data_end); */
/* if (lines == NULL) */
/* return FALSE; */
/* for (row = start_row, lrow = 0; lrow < lines->len ; row++, lrow++) { */
/* col = start_col; */
/* line = g_ptr_array_index (lines, lrow); */
/* for (lcol = 0; lcol < line->len; lcol++) */
/* if (parseoptions->col_import_array == NULL || */
/* parseoptions->col_import_array_len <= lcol || */
/* parseoptions->col_import_array[lcol]) { */
/* if (col >= SHEET_MAX_COLS) { */
/* if (!parseoptions->cols_exceeded) { */
/* g_warning (_("There are more columns of data than " */
/* "there is room for in the sheet. Extra " */
/* "columns will be ignored.")); */
/* parseoptions->cols_exceeded = TRUE; */
/* } */
/* } else { */
/* char const *text = g_ptr_array_index (line, lcol); */
/* if (text && *text) */
/* gnm_cell_set_text ( */
/* sheet_cell_fetch (sheet, col, row), */
/* text); */
/* } */
/* col++; */
/* } */
/* } */
/* stf_parse_general_free (lines); */
/* g_string_chunk_free (lines_chunk); */
/* END_LOCALE_SWITCH; */
/* return TRUE; */
/* } */
/* GnmCellRegion * */
/* stf_parse_region (StfParseOptions_t *parseoptions, char const *data, char const *data_end, */
/* Workbook const *wb) */
/* { */
/* static GODateConventions const default_conv = {FALSE}; */
/* GODateConventions const *date_conv = wb ? workbook_date_conv (wb) : &default_conv; */
/* GnmCellRegion *cr; */
/* unsigned int row, colhigh = 0; */
/* char *text; */
/* GStringChunk *lines_chunk; */
/* GPtrArray *lines; */
/* GnmCellCopy *cc; */
/* GOFormat *fmt; */
/* GnmValue *v; */
/* SETUP_LOCALE_SWITCH; */
/* g_return_val_if_fail (parseoptions != NULL, NULL); */
/* g_return_val_if_fail (data != NULL, NULL); */
/* START_LOCALE_SWITCH; */
/* cr = cellregion_new (NULL); */
/* if (!data_end) */
/* data_end = data + strlen (data); */
/* lines_chunk = g_string_chunk_new (100 * 1024); */
/* lines = stf_parse_general (parseoptions, lines_chunk, data, data_end); */
/* for (row = 0; row < lines->len; row++) { */
/* GPtrArray *line = g_ptr_array_index (lines, row); */
/* unsigned int col, targetcol = 0; */
/* for (col = 0; col < line->len; col++) { */
/* if (parseoptions->col_import_array == NULL || */
/* parseoptions->col_import_array_len <= col || */
/* parseoptions->col_import_array[col]) { */
/* if (NULL != (text = g_ptr_array_index (line, col))) { */
/* fmt = g_ptr_array_index ( */
/* parseoptions->formats, col); */
/* if (NULL == (v = format_match (text, fmt, date_conv))) */
/* v = value_new_string (text); */
/* cc = gnm_cell_copy_new (cr, targetcol, row); */
/* cc->val = v; */
/* cc->texpr = NULL; */
/* targetcol++; */
/* if (targetcol > colhigh) */
/* colhigh = targetcol; */
/* } */
/* } */
/* } */
/* } */
/* stf_parse_general_free (lines); */
/* g_string_chunk_free (lines_chunk); */
/* END_LOCALE_SWITCH; */
/* cr->cols = (colhigh > 0) ? colhigh : 1; */
/* cr->rows = row; */
/* return cr; */
/* } */
/*
 * int_sort:
 *
 * qsort comparison function for arrays of int, ascending order.
 * Returns a negative value, zero or a positive value if *@a is less
 * than, equal to or greater than *@b.
 *
 * The values are compared rather than subtracted, since the difference
 * of two ints can overflow (e.g. INT_MIN - INT_MAX).
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	return (lhs > rhs) - (lhs < rhs);
}
/*
 * count_character:
 * @lines: array of lines; the first string of each line is examined
 * @c: the character to count
 * @quantile: which quantile of the per-line counts to return
 *
 * Counts the occurrences of @c on every non-empty line, sorts the counts
 * in ascending order and returns the one at index ceil (@quantile * n),
 * where n is the number of non-empty lines (clamped to the last entry
 * when the index equals n).  Returns 0 when there are no non-empty
 * lines.
 */
static int
count_character (GPtrArray *lines, gunichar c, double quantile)
{
	int *per_line;
	int result = 0;
	unsigned int row, used = 0;

	if (lines->len == 0)
		return 0;

	per_line = g_new (int, lines->len);

	for (row = 0; row < lines->len; row++) {
		GPtrArray *boxline = g_ptr_array_index (lines, row);
		char const *p = g_ptr_array_index (boxline, 0);
		int hits = 0;

		/* Ignore empty lines. */
		if (*p == 0)
			continue;

		for (; *p; p = g_utf8_next_char (p))
			if (g_utf8_get_char (p) == c)
				hits++;

		per_line[used++] = hits;
	}

	if (used != 0) {
		unsigned int qi = (unsigned int)ceil (quantile * used);

		qsort (per_line, used, sizeof (per_line[0]), int_sort);
		if (qi == used)
			qi--;
		result = per_line[qi];
	}

	g_free (per_line);
	return result;
}
StfParseOptions_t *
stf_parse_options_guess (char const *data)
{
StfParseOptions_t *res;
GStringChunk *lines_chunk;
GPtrArray *lines;
int tabcount;
int sepcount;
/* TODO In the future, use the goffice 0.3. */
/* gunichar sepchar = go_locale_get_arg_sep (); */
gunichar sepchar = ',';
g_return_val_if_fail (data != NULL, NULL);
res = stf_parse_options_new ();
lines_chunk = g_string_chunk_new (100 * 1024);
lines = stf_parse_lines (res, lines_chunk, data, SHEET_MAX_ROWS, FALSE);
tabcount = count_character (lines, '\t', 0.2);
sepcount = count_character (lines, sepchar, 0.2);
/* At least one tab per line and enough to separate every
would-be sepchars. */
if (tabcount >= 1 && tabcount >= sepcount - 1)
stf_parse_options_csv_set_separators (res, "\t", NULL);
else {
gunichar c;
/*
* Try a few more or less likely characters and pick the first
* one that occurs on at least half the lines.
*
* The order is mostly random, although ' ' and '!' which
* could very easily occur in text are put last.
*/
/* TODO Replace with the 0.3 goffice call in the future. */
if (count_character (lines, (c = sepchar), 0.5) > 0 ||
/* count_character (lines, (c = go_locale_get_col_sep ()), 0.5) > 0 || */
count_character (lines, (c = ','), 0.5) > 0 ||
count_character (lines, (c = ':'), 0.5) > 0 ||
count_character (lines, (c = ','), 0.5) > 0 ||
count_character (lines, (c = ';'), 0.5) > 0 ||
count_character (lines, (c = '|'), 0.5) > 0 ||
count_character (lines, (c = '!'), 0.5) > 0 ||
count_character (lines, (c = ' '), 0.5) > 0) {
char sep[7];
sep[g_unichar_to_utf8 (c, sep)] = 0;
if (c == ' ')
strcat (sep, "\t");
stf_parse_options_csv_set_separators (res, sep, NULL);
}
}
if (1) {
/* Separated */
gboolean dups =
res->sep.chr &&
strchr (res->sep.chr, ' ') != NULL;
gboolean trim =
res->sep.chr &&
strchr (res->sep.chr, ' ') != NULL;
stf_parse_options_set_type (res, PARSE_TYPE_CSV);
stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
stf_parse_options_csv_set_duplicates (res, dups);
stf_parse_options_csv_set_trim_seps (res, trim);
stf_parse_options_csv_set_stringindicator (res, '"');
} else {
/* Fixed-width */
}
stf_parse_general_free (lines);
g_string_chunk_free (lines_chunk);
return res;
}