Files
cantera/tools/testtools/csvdiff.cpp

838 lines
23 KiB
C++

/*====================================================================
* ------------------------
* | CVS File Information |
* ------------------------
* $RCSfile$
* $Author$
* $Date$
* $Revision$
* $Name$
*====================================================================*/
/*
*
* csvdiff File1.csv File2.csv
*
* Compares the variable values in two Excel formatted
* comma separated files.
* The comparison is done using a weighted norm basis.
*
* The two files should be basically equal. However, File1.csv is
* taken as the reference file, that has precedence, when there is
* something to be decided upon.
*
* Arguments:
* -h = prints this usage information
*
* Shell Return Values
* 1 = Comparison was successful
* 0 = One or more nodal values failed the comparison
* -1 = Apples to oranges, the files can not even be compared against
* one another.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include "../../config.h"
#ifndef WINMSVC
#include <unistd.h>
#else
#include <string>
#endif
using namespace std;
#if defined(__CYGWIN__)
#include <getopt.h>
#endif
#include "mdp_allo.h"
#include "tok_input_util.h"
#ifndef TRUE
# define TRUE 1
# define FALSE 0
#endif
#ifndef MAX
# define MAX(x,y) (( (x) > (y) ) ? (x) : (y))
#endif
#ifndef MIN
# define MIN(x,y) (( (x) < (y) ) ? (x) : (y))
#endif
/*
* File-scope settings shared by the routines below.
*/
/* When non-zero, main() also prints a line for each quantity that matched. */
int Debug_Flag = TRUE;
/* Relative tolerance that main() hands to the value comparison routines. */
double grtol = 1.0E-3;
/* Base absolute tolerance; main() passes it to get_atol() for per-column scaling. */
double gatol = 1.0E-9;
/*
* Length used (plus one char) for the line buffers allocated in
* read_title(), read_colTitle() and read_values().
* First iteration towards getting this variable
*/
int Max_Input_Str_Ln = MAX_INPUT_STR_LN;
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
#ifdef WINMSVC
/*
* Windows doesn't have getopt(). This is an incomplete version that
* does enough to handle required functionality.
*/
int optind = -1;
int getopt(int argc, char **argv, const char *) {
static int currArg = 1;
string tok;
static int charPos = 0;
int rc = -1;
if (currArg >= argc) {
return -rc;
}
tok = string(argv[currArg]);
size_t len = strlen(tok.c_str());
if (charPos == 0) {
bool found = false;
do {
tok = string(argv[currArg]);
len = strlen(tok.c_str());
if (len > 1 && tok[0] == '-') {
found = true;
charPos = 1;
if (len > 2 && tok[1] == '-') {
charPos = 2;
}
} else {
if (optind == -1) {
optind = currArg;
}
}
if (!found) {
if (currArg < (argc-1)) {
currArg++;
} else {
return -1;
}
}
} while (!found);
}
rc = tok[charPos];
if (charPos < static_cast<int>(len - 1)) {
charPos++;
} else {
charPos = 0;
}
return rc;
}
#endif
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*
 * diff_double():
 *    Fuzzy "not equal" test for two doubles.
 *
 *    The allowed gap is atol plus rtol times the mean of the two
 *    magnitudes.
 *
 * Return:
 *    1 if |d1 - d2| exceeds the allowed gap, 0 otherwise.
 */
static int diff_double(double d1, double d2, double rtol, double atol)
{
    const double gap = fabs(d1 - d2);
    const double allowed = atol + rtol * 0.5 * (fabs(d1) + fabs(d2));
    return (gap > allowed) ? 1 : 0;
}
/*
 * diff_double_slope():
 *    Fuzzy "not equal" test for two doubles with an extra allowance that
 *    grows with the local slopes: xtol * (|slope1| + |slope2|) is added
 *    to the absolute tolerance before the comparison is made.
 *
 * Return:
 *    1 if |d1 - d2| exceeds the combined tolerance, 0 otherwise.
 */
static int diff_double_slope(double d1, double d2, double rtol,
                             double atol, double xtol, double slope1, double slope2)
{
    const double slopeAllowance = xtol * (fabs(slope1) + fabs(slope2));
    const double allowed =
        atol + slopeAllowance + rtol * 0.5 * (fabs(d1) + fabs(d2));
    if (fabs(d1 - d2) > allowed) {
        return 1;
    }
    return 0;
}
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*
 * calc_rdiff():
 *    Size of the difference between d1 and d2 measured in units of the
 *    fuzzy tolerance, i.e. |d1 - d2| divided by
 *    (atol + rtol * mean magnitude of the two values).
 */
static double calc_rdiff(double d1, double d2, double rtol, double atol)
{
    const double gap = fabs(d1 - d2);
    const double scale = atol + rtol * 0.5 * (fabs(d1) + fabs(d2));
    return gap / scale;
}
/*****************************************************************************/
/*
 * breakStrCommas():
 *    This routine will break a character string into stringlets according
 *    to the placement of commas. The commas are replaced by null
 *    characters.
 *
 * Argument:
 *    str       => original string. On exit, this string will have been
 *                 altered.
 *    strlets   -> Vector of pointers to char *. The vector has a size
 *                 larger than or equal to maxPieces.
 *    maxPieces -> largest number of pieces to divide the string into.
 *                 Once this many pieces exist, any remaining commas are
 *                 left untouched in the last piece.
 *
 * Return:
 *    This returns the number of pieces that the string is actually
 *    broken up into. It is 1 when either str or strlets is null, and it
 *    never exceeds maxPieces when maxPieces >= 1.
 */
static int breakStrCommas(char *str, char **strlets, int maxPieces)
{
    int numbreaks = 0;
    if (strlets) {
        strlets[0] = str;
        if (str) {
            char *cptr = str;
            /*
             * Test the piece limit BEFORE splitting, so that a limit of
             * one never writes strlets[1] (the vector may hold only
             * maxPieces entries).
             */
            while (numbreaks < (maxPieces - 1)) {
                char *cetn = strchr(cptr, (int) ',');
                if (!cetn) {
                    break;
                }
                *cetn = '\0';
                cptr = cetn + 1;
                numbreaks++;
                strlets[numbreaks] = cptr;
            }
        }
    }
    return numbreaks + 1;
}
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
#define LT_NULLLINE 0
#define LT_TITLELINE 1
#define LT_COLTITLE 2
#define LT_DATALINE 3
/*
 * get_sizes()
 *
 * This routine obtains the sizes of the various elements of the file
 * by parsing the file.
 * (HKM: Note, this file could use some work. However, it's always
 * going to be heuristic)
 *
 * The file is rewound and scanned twice: the first pass counts the commas
 * on (at most) the first 100 lines, the second pass classifies those lines
 * as title, column-title or data lines. Once the first data line has been
 * found, the remaining lines are read to count the data rows.
 *
 * Arguments:
 *
 * fp             = File pointer
 * nTitleLines    = Number of title lines
 * nColTitleLines = Number of column title lines
 * nCol           = Number of columns -> basically equal to the
 *                  number of variables
 * nDataRows      = Number of rows of data in the file
 *
 * nTitleLines, nColTitleLines and nDataRows are returned as 0 when the
 * corresponding kind of line is never recognized.
 */
static void get_sizes(FILE *fp, int &nTitleLines, int &nColTitleLines,
                      int &nCol, int &nDataRows)
{
    const int nScanLinesMAX = 100;
    int nScanLines = nScanLinesMAX;
    int retn, i, j;
    int maxCommas = 0;
    TOKEN fieldToken;
    char *scanLine = mdp_alloc_char_1(MAX_INPUT_STR_LN+1, '\0');
    int *numCommas = mdp_alloc_int_1(nScanLinesMAX, -1);
    /*
     * Give every output a defined value up front; the assignments below
     * are only reached when the matching kind of line is found.
     */
    nTitleLines = 0;
    nColTitleLines = 0;
    nDataRows = 0;
    /*
     * Rewind the file
     */
    rewind(fp);
    /*
     * Read the scan lines
     */
    for (i = 0; i < nScanLinesMAX; i++) {
        retn = read_line(fp, scanLine, 0);
        if (retn == -1) {
            /* -1 from read_line() is treated as the end of the input */
            nScanLines = i;
            break;
        }
        /*
         * Strip a trailing comma from the scanline -
         * -> These are not significant
         */
        int ccount = static_cast<int>(strlen(scanLine));
        if (ccount > 0) {
            if (scanLine[ccount-1] == ',') scanLine[ccount-1] = '\0';
        }
        /*
         * Count the number of commas in the line
         */
        char *cptr = scanLine;
        char *cetn = NULL;
        numCommas[i] = 0;
        do {
            cetn = strchr(cptr, (int) ',');
            if (cetn) {
                numCommas[i]++;
                cptr = cetn + 1;
            }
        } while (cetn);
        /*
         * The first two lines do not take part in the maximum
         */
        if (i > 1) {
            if (maxCommas < numCommas[i]) maxCommas = numCommas[i];
        }
    }
    /*
     * set a preliminary value of nCol
     */
    nCol = maxCommas + 1;
    /*
     * One spare slot, as in read_colTitle() and read_values(), so that a
     * stray extra piece cannot land outside the vector.
     */
    char **strlets = (char **) mdp_alloc_ptr_1(nCol + 1);
    int doingLineType = LT_TITLELINE;
    rewind(fp);
    for (i = 0; i < nScanLines; i++) {
        retn = read_line(fp, scanLine, 0);
        /*
         * Strip a trailing comma from the scanline -
         * -> These are not significant
         */
        int ccount = static_cast<int>(strlen(scanLine));
        if (ccount > 0) {
            if (scanLine[ccount-1] == ',') scanLine[ccount-1] = '\0';
        }
        int ncolsFound = breakStrCommas(scanLine, strlets, nCol);
        /*
         * The first line carrying the full number of commas ends the
         * title section.
         */
        if (doingLineType == LT_TITLELINE) {
            if (numCommas[i] == maxCommas) {
                doingLineType = LT_COLTITLE;
                nTitleLines = i;
            }
        }
        /*
         * A line whose every field is a single token that converts to a
         * double ends the column-title section.
         */
        if (doingLineType == LT_COLTITLE) {
            int goodDataLine = TRUE;
            int rerr = FALSE;
            for (j = 0; j < ncolsFound; j++) {
                char *fieldStr = strlets[j];
                fillTokStruct(&fieldToken, fieldStr);
                if (fieldToken.ntokes != 1) {
                    goodDataLine = FALSE;
                    break;
                }
                (void) tok_to_double(&fieldToken, DBL_MAX,
                                     -DBL_MAX, 0.0, &rerr);
                if (rerr) {
                    goodDataLine = FALSE;
                    break;
                }
            }
            if (goodDataLine) {
                doingLineType = LT_DATALINE;
            }
            nColTitleLines = i - nTitleLines;
        }
        if (doingLineType == LT_DATALINE) break;
    }
    /*
     * Count the data rows. The first data line has already been consumed
     * by the loop above, so each pass here reads the line AFTER row i.
     */
    if (doingLineType == LT_DATALINE) {
        for (i = nColTitleLines + nTitleLines; ; i++) {
            retn = read_line(fp, scanLine, 0);
            if (retn == -1) {
                nDataRows = i - nColTitleLines - nTitleLines + 1;
                break;
            }
            /*
             * Strip a trailing comma from the scanline -
             * -> These are not significant
             */
            int ccount = static_cast<int>(strlen(scanLine));
            if (ccount > 0) {
                if (scanLine[ccount-1] == ',') scanLine[ccount-1] = '\0';
            }
            int ncolsFound = breakStrCommas(scanLine, strlets, nCol);
            int goodDataLine = TRUE;
            int rerr = FALSE;
            for (j = 0; j < ncolsFound; j++) {
                char *fieldStr = strlets[j];
                fillTokStruct(&fieldToken, fieldStr);
                if (fieldToken.ntokes != 1) {
                    goodDataLine = FALSE;
                    break;
                }
                (void) tok_to_double(&fieldToken, DBL_MAX,
                                     -DBL_MAX, 0.0, &rerr);
                if (rerr) {
                    goodDataLine = FALSE;
                    break;
                }
            }
            /*
             * The first non-numeric line ends the data section
             */
            if (! goodDataLine) {
                doingLineType = LT_NULLLINE;
                nDataRows = i - nColTitleLines - nTitleLines + 1;
                break;
            }
        }
    }
    mdp_safe_free((void **) &strlets);
    mdp_safe_free((void **) &scanLine);
    mdp_safe_free((void **) &numCommas);
    return;
}
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*
 * read_title():
 *    Reads nTitleLines lines from the current position of fp and stores a
 *    copy of each one, with a single trailing comma removed.
 *
 * Arguments:
 *    fp          = File pointer, positioned at the first title line
 *    title       = On return, *title points to a newly allocated vector
 *                  of nTitleLines string pointers. Entry i receives the
 *                  copy of line i; an entry is not assigned if the read
 *                  of its line fails.
 *    nTitleLines = Number of title lines to read
 */
static void
read_title(FILE *fp, char ***title, int nTitleLines)
{
    int retn;
    *title = (char **) mdp_alloc_ptr_1(nTitleLines);
    /*
     * Index through a local alias: "*title[i]" would parse as
     * "*(title[i])" and write beside the caller's pointer for i > 0.
     */
    char **titleLines = *title;
    char *scanLine = mdp_alloc_char_1(Max_Input_Str_Ln + 1, '\0');
    for (int i = 0; i < nTitleLines ; i++) {
        retn = read_line(fp, scanLine, 0);
        if (retn >= 0) {
            /*
             * Strip a trailing comma from the scanline -
             * -> These are not significant
             */
            int ccount = static_cast<int>(strlen(scanLine));
            if (ccount > 0) {
                if (scanLine[ccount-1] == ',') scanLine[ccount-1] = '\0';
            }
            titleLines[i] = mdp_copy_string(scanLine);
        }
    }
    mdp_safe_free((void **) &scanLine);
}
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*
 * read_colTitle():
 *    Reads nColTitleLines lines from the current position of fp, splits
 *    each one at its commas and stores the stripped pieces as the column
 *    names belonging to that title line.
 *
 * Arguments:
 *    fp             = File pointer, positioned at the first column title
 *                     line
 *    ColMLNames_ptr = On return, *ColMLNames_ptr is a newly allocated
 *                     vector indexed as [title line][column]; each title
 *                     line that was read holds nCol fixed-length strings.
 *    nColTitleLines = Number of column title lines to read
 *    nCol           = Number of columns
 */
static void
read_colTitle(FILE *fp, char ****ColMLNames_ptr, int nColTitleLines, int nCol)
{
    int retn, j;
    /*
     * The outer vector is indexed by title line, so it must hold at least
     * nColTitleLines entries; never allocate fewer than the nCol entries
     * that were historically requested.
     */
    const int nOuter = (nColTitleLines > nCol) ? nColTitleLines : nCol;
    *ColMLNames_ptr = (char ***) mdp_alloc_ptr_1(nOuter);
    char ***ColMLNames = *ColMLNames_ptr;
    char *scanLine = mdp_alloc_char_1(Max_Input_Str_Ln + 1, '\0');
    char **strlets = (char **) mdp_alloc_ptr_1(nCol+1);
    for (int i = 0; i < nColTitleLines ; i++) {
        retn = read_line(fp, scanLine, 0);
        if (retn >= 0) {
            /*
             * Strip a trailing comma from the scanline -
             * -> These are not significant
             */
            int ccount = static_cast<int>(strlen(scanLine));
            if (ccount > 0) {
                if (scanLine[ccount-1] == ',') scanLine[ccount-1] = '\0';
            }
            int ncolsFound = breakStrCommas(scanLine, strlets, nCol);
            ColMLNames[i] = mdp_alloc_VecFixedStrings(nCol, MAX_TOKEN_STR_LN+1);
            /*
             * Only nCol name slots exist for this line
             */
            const int nCopy = (ncolsFound < nCol) ? ncolsFound : nCol;
            for (j = 0; j < nCopy; j++) {
                strip(strlets[j]);
                /*
                 * Bounded copy; assumes each slot holds the
                 * MAX_TOKEN_STR_LN+1 chars requested above -- TODO confirm
                 * against mdp_alloc_VecFixedStrings().
                 */
                strncpy(ColMLNames[i][j], strlets[j], MAX_TOKEN_STR_LN);
                ColMLNames[i][j][MAX_TOKEN_STR_LN] = '\0';
            }
        }
    }
    mdp_safe_free((void **) &scanLine);
    mdp_safe_free((void **) &strlets);
}
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*
 * get_atol():
 *    Scales an absolute tolerance by the size of a column of values.
 *
 * Arguments:
 *    values = Vector of nvals values (may be null when nvals <= 0)
 *    nvals  = Number of entries in values
 *    atol   = Base absolute tolerance
 *
 * Return:
 *    (rms + 1) * atol, where rms is the root-mean-square of the values.
 *    With no values to look at the base tolerance atol is returned
 *    unchanged, which is also what an all-zero column yields.
 */
static double get_atol(const double *values, const int nvals,
                       const double atol)
{
    if (nvals <= 0 || !values) {
        return atol;
    }
    double sumSq = 0.0;
    for (int i = 0; i < nvals; i++) {
        sumSq += values[i] * values[i];
    }
    sumSq /= nvals;
    const double rms = sqrt(sumSq);
    return ((rms + 1.0) * atol);
}
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*
 * read_values():
 *    Reads up to nDataRows lines from the current position of fp and
 *    stores the numbers found on them in NVValues[column][row].
 *
 *    Reading stops early, without any message, at the first line that
 *    cannot be read or that contains a field which is not a single token
 *    convertible to a double. Fields ahead of the offending one on that
 *    line have already been stored by then.
 *
 * Arguments:
 *    fp        = File pointer, positioned at the first data line
 *    NVValues  = Storage for the values, indexed [column][row]
 *    nCol      = Maximum number of fields taken from each line
 *    nDataRows = Maximum number of lines to read
 */
static void
read_values(FILE *fp, double **NVValues, int nCol, int nDataRows)
{
    char **fields = (char **) mdp_alloc_ptr_1(nCol+1);
    char *lineBuf = mdp_alloc_char_1(Max_Input_Str_Ln + 1, '\0');
    TOKEN fieldToken;

    for (int row = 0; row < nDataRows; row++) {
        if (read_line(fp, lineBuf, 0) == -1) {
            break;
        }
        /*
         * A trailing comma is not significant - drop it
         */
        const int lineLen = static_cast<int>(strlen(lineBuf));
        if (lineLen > 0 && lineBuf[lineLen-1] == ',') {
            lineBuf[lineLen-1] = '\0';
        }
        const int nFields = breakStrCommas(lineBuf, fields, nCol);
        int convErr = FALSE;
        int col = 0;
        while (col < nFields) {
            fillTokStruct(&fieldToken, fields[col]);
            if (fieldToken.ntokes != 1) {
                break;
            }
            const double number = tok_to_double(&fieldToken, DBL_MAX,
                                                -DBL_MAX, 0.0, &convErr);
            if (convErr) {
                break;
            }
            NVValues[col][row] = number;
            col++;
        }
        /*
         * Leaving the field loop early means the line was not pure data
         */
        if (col < nFields) {
            break;
        }
    }
    mdp_safe_free((void **) &fields);
    mdp_safe_free((void **) &lineBuf);
}
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*
 * print_usage():
 *    Writes the command line help text to standard output.
 */
static void print_usage() {
    static const char *const usageLines[] = {
        "\t\n",
        " csvdiff [-h] File1.csv File2.csv\n",
        "\t\n",
        "\tCompares the variable values in two Excel formatted "
        "comma separated files.\n",
        "\tThe comparison is done using a weighted norm basis.\n",
        "\t\n",
        "\tThe two files should be basically equal. However, File1.csv is\n",
        "\ttaken as the reference file that has precedence, when there is\n",
        "\tsomething to be decided upon.\n",
        "\t\n",
        "\t Arguments:\n",
        "\t -h = Usage info\n",
        "\t\n",
        "\t Shell Return Values:\n",
        "\t 1 = Comparison was successful\n",
        "\t 0 = One or more nodal values failed the comparison\n",
        "\t -1 = Apples to oranges, the files can not even be compared against\n",
        "\t one another.\n",
        "\t\n"
    };
    const size_t nLines = sizeof(usageLines) / sizeof(usageLines[0]);
    /* None of the lines holds a conversion specifier, so emit them verbatim */
    for (size_t k = 0; k < nLines; k++) {
        fputs(usageLines[k], stdout);
    }
}
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*
 * main driver for csvdiff.
 *
 * Usage: csvdiff [-h] File1.csv File2.csv
 *
 * Sizes both files, matches their columns by the names found on the first
 * column title line, reads the values and compares every matched column
 * row by row with a fuzzy tolerance.
 *
 * Return:
 *    1 if every compared column passed, 0 if at least one value failed or
 *    the command line was in error. The program exits with -1 when the
 *    files cannot be compared at all (wrong argument count, a file that
 *    cannot be opened, differing row counts, no column title line).
 */
int main(int argc, char *argv[])
{
    int opt_let;
    char *fileName1=NULL, *fileName2=NULL; /* Names of the csv files */
    FILE *fp1=NULL, *fp2=NULL;
    /*
     * Sizes start at zero so that they are defined even if get_sizes()
     * does not recognize the layout of a file.
     */
    int nTitleLines1 = 0, nTitleLines2 = 0;
    int nColTitleLines1 = 0, nColTitleLines2 = 0;
    int nCol1 = 0, nCol2 = 0, nColMAX, nColcomparisons;
    int nDataRows1 = 0, nDataRows2 = 0;
    char **title1 = NULL, **title2 = NULL;
    int **compColList = NULL;
    char ***ColMLNames1 = NULL, ***ColMLNames2 = NULL;
    char **ColNames1 = NULL, **ColNames2 = NULL;
    double **NVValues1 = NULL, **NVValues2 = NULL;
    double *curVarValues1, *curVarValues2;
    int mixed_var = 0;
    int i, j, ndiff, jmax, i1, i2, k, found;
    double max_diff, rel_diff;
    int testPassed = 1;
    double atol_j;
    /********************** BEGIN EXECUTION ************************************/
    /*
     * Interpret command line arguments
     */
    /* Loop over each command line option */
    while((opt_let = getopt(argc, argv, "h")) != EOF) {
        /* case over the option letter */
        switch(opt_let) {
        case 'h':
            /* Usage info was requested */
            print_usage();
            exit(0);
        default:
            /* Default case. Error on unknown argument. */
            fprintf(stderr, "ERROR in command line usuage:\n");
            print_usage();
            return 0;
        } /* End "switch(opt_let)" */
    } /* End "while((opt_let=getopt(argc, argv, "h")) != EOF)" */
    /* Exactly two non-option arguments (the two file names) must remain */
    if (optind != argc-2) {
        print_usage();
        exit(-1);
    } else {
        fileName1 = argv[argc-2];
        fileName2 = argv[argc-1];
    }
    /*
     * Print Out Header
     */
    printf("\n");
    printf("----------------------------------------------------------\n");
    printf("csvdiff: CSVFile comparison utility program\n");
    printf(" Version $Revision$\n");
    printf(" Harry K. Moffat Div. 9114 Sandia National Labs\n");
    printf(" \n");
    printf(" First CSV File = %s\n", fileName1);
    printf(" Second CSV file = %s\n", fileName2);
    printf("----------------------------------------------------------\n");
    printf("\n");
    /*
     * Open up the two ascii Files #1 and #2
     */
    if (!(fp1 = fopen(fileName1, "r"))) {
        fprintf(stderr,"Error opening up file1, %s\n", fileName1);
        exit(-1);
    }
    if (!(fp2 = fopen(fileName2, "r"))) {
        fprintf(stderr, "Error opening up file2, %s\n", fileName2);
        exit(-1);
    }
    /*
     * Obtain the size of the problem information: Compare between files.
     */
    get_sizes(fp1, nTitleLines1, nColTitleLines1, nCol1, nDataRows1);
    get_sizes(fp2, nTitleLines2, nColTitleLines2, nCol2, nDataRows2);
    if (nTitleLines1 != nTitleLines2) {
        printf("Number o Title Lines differ:, %d %d\n",nTitleLines1, nTitleLines2);
    } else if (Debug_Flag) {
        printf("Number of Title Lines in each file = %d\n", nTitleLines1);
    }
    if (nColTitleLines1 != nColTitleLines2) {
        printf("Number of Column title lines differ:, %d %d\n", nColTitleLines1,
               nColTitleLines2);
    } else if (Debug_Flag) {
        printf("Number of column title lines in each file = %d\n", nColTitleLines1);
    }
    /*
     * Right now, if the number of data rows differ, we will punt.
     * Maybe later we can do something more significant
     */
    if (nDataRows1 != nDataRows2) {
        printf("Number of Data rows in file1, %d, is different than file2, %d\n",
               nDataRows1, nDataRows2);
        exit(-1);
    }
    rewind(fp1);
    rewind(fp2);
    read_title(fp1, &title1, nTitleLines1);
    read_title(fp2, &title2, nTitleLines2);
    if (nTitleLines1 > 0 && nTitleLines2 > 0) {
        if (strcmp(title1[0], title2[0]) != 0) {
            printf("Titles differ:\n\t\"%s\"\n\t\"%s\"\n", title1[0], title2[0]);
        } else if (Debug_Flag) {
            printf("Title for each file: \"%s\"\n", title1[0]);
        }
    } else {
        if (nTitleLines1 != nTitleLines2) {
            if (nTitleLines1) {
                printf("Titles differ: title for first file: \"%s\"\n",
                       title1[0]);
            }
            if (nTitleLines2) {
                printf("Titles differ: title for second file: \"%s\"\n",
                       title2[0]);
            }
        }
    }
    /*
     * Get the number of column variables in each file
     */
    mixed_var = FALSE;
    if (nCol1 != nCol2) {
        printf("Number of column variables differ:, %d %d\n",
               nCol1, nCol2);
        mixed_var = TRUE;
    } else if (Debug_Flag) {
        printf("Number of column variables in both files = %d\n",
               nCol1);
    }
    /*
     * Columns are matched by name, so each file needs at least one column
     * title line; without one there is no name vector to index below.
     */
    if (nColTitleLines1 < 1 || nColTitleLines2 < 1) {
        printf("csvdiff ERROR: no column title line found, "
               "the columns can not be matched by name\n");
        exit(-1);
    }
    /*
     * Read the names of the column variables
     */
    read_colTitle(fp1, &ColMLNames1, nColTitleLines1, nCol1);
    read_colTitle(fp2, &ColMLNames2, nColTitleLines2, nCol2);
    /* Only the first column title line is used for the matching */
    ColNames1 = ColMLNames1[0];
    ColNames2 = ColMLNames2[0];
    /*
     * Do a Comparison of the names to find the maximum number
     * of matches.
     */
    nColMAX = MAX(nCol1, nCol2);
    compColList = mdp_alloc_int_2(nColMAX, 2, -1);
    nColcomparisons = 0;
    for (i = 0; i < nCol1; i++) {
        found = FALSE;
        for (j = 0; j < nCol2; j++) {
            if (!strcmp(ColNames1[i], ColNames2[j])) {
                /* pair column i of file 1 with column j of file 2 */
                compColList[nColcomparisons][0] = i;
                compColList[nColcomparisons][1] = j;
                nColcomparisons++;
                found = TRUE;
                if (i != j) mixed_var = 1;
                break;
            }
        }
        if (!found) {
            printf("csvdiff WARNING Variable %s (%d) in first file not found"
                   " in second file\n", ColNames1[i], i);
        }
    }
    for (j = 0; j < nCol2; j++) {
        found = FALSE;
        for (i = 0; i < nColcomparisons; i++) {
            if (compColList[i][1] == j) found = TRUE;
        }
        if (! found) {
            printf("csvdiff WARNING Variable %s (%d) in second file "
                   "not found in first file\n",
                   ColNames2[j], j);
        }
    }
    /*
     * Allocate storage for the column variables
     */
    NVValues1 = mdp_alloc_dbl_2(nCol1, nDataRows1, 0.0);
    NVValues2 = mdp_alloc_dbl_2(nCol2, nDataRows2, 0.0);
    /*
     * Read in the values to the arrays
     */
    read_values(fp1, NVValues1, nCol1, nDataRows1);
    read_values(fp2, NVValues2, nCol2, nDataRows2);
    /*
     * Compare the solutions in each file
     */
#define DGG_MODS
#ifdef DGG_MODS
    double slope1, slope2, xatol;
#endif
    for (k = 0; k < nColcomparisons; k++) {
        i1 = compColList[k][0];
        i2 = compColList[k][1];
        curVarValues1 = NVValues1[i1];
        curVarValues2 = NVValues2[i2];
        max_diff = 0.0;
        ndiff = 0;
        jmax = 0;
        /* use the larger of the two column-scaled absolute tolerances */
        atol_j = get_atol(curVarValues1, nDataRows1, gatol);
        atol_j = MAX(atol_j, get_atol(curVarValues2, nDataRows2, gatol));
        for (j = 0; j < nDataRows1; j++) {
#ifdef DGG_MODS
            slope1 = 0.0;
            slope2 = 0.0;
            xatol = 0.0;
            /*
             * The spacing of column 0 between this row and the previous
             * one only exists from the second row on; row 0 has no
             * predecessor to difference against.
             */
            if (j > 0) {
                xatol = fabs(grtol * (NVValues1[0][j] - NVValues1[0][j-1]));
            }
            if (j > 0 && k > 0) {
                /* backward-difference slopes with respect to column 0 */
                slope1 = (curVarValues1[j] - curVarValues1[j-1])/
                         (NVValues1[0][j] - NVValues1[0][j-1]);
                slope2 = (curVarValues2[j] - curVarValues2[j-1])/
                         (NVValues2[0][j] - NVValues2[0][j-1]);
            }
            if (diff_double_slope(curVarValues1[j], curVarValues2[j],
                                  grtol, atol_j, xatol, slope1, slope2)) {
#else
            if (diff_double(curVarValues1[j], curVarValues2[j], grtol, atol_j)) {
#endif
                ndiff++;
                rel_diff = calc_rdiff((double) curVarValues1[j],
                                      (double) curVarValues2[j], grtol, atol_j);
                if (rel_diff > max_diff) {
                    jmax = j;
                    max_diff = rel_diff;
                }
                /* only the first few differing rows are listed */
                if (ndiff < 10) {
                    printf("\tColumn variable %s at data row %d ", ColNames1[i1], j);
                    printf(" differ: %g %g\n", curVarValues1[j],
                           curVarValues2[j]);
                }
            }
        }
        /*
         * Print out final results of nodal variable test
         */
        if (ndiff > 0) {
            printf(
                "Column variable %s failed comparison test for %d occurances\n",
                ColNames1[i1], ndiff);
            printf(" Largest difference was at data row %d ", jmax);
            printf(": %g %g\n", curVarValues1[jmax], curVarValues2[jmax]);
            testPassed = 0;
        } else if (Debug_Flag) {
            printf("Column variable %s passed\n", ColNames1[i1]);
        }
    }
    fclose(fp1);
    fclose(fp2);
    return(testPassed);
} /************END of main() *************************************************/
/*****************************************************************************/