-
March 25th, 2009, 02:37 AM
#1
Regular expression does not parse correctly
Hi!
I'm having a hard time making my little parse-function work. I have a CSV file which works just fine when parsing, and i get all the rows i should get. But when i try to use a tab-delimited file instead, i only get 2700 out of 3500 records parsed into the containing datatable.
When parsing a CSV file the following statement is used:
ParseCSV(String inputString,0);
And when parsing a tab-delimited file the following statement is used:
ParseCSV(String inputString,1);
The second parameter simply states if its a tab- or csv-delimited file we're dealing with.
And here comes the function that parses the CSV into a datatable.. i have a hunch that the error is in the regex - im no codeguru in that respect.
public static DataTable ParseCSV(string inputString, int type)
{
//Remove headings - it messes up the system.
inputString = inputString.Substring(252);
DataTable dt = new DataTable();
// declare the Regular Expression that will match versus
the input string
Regex re;
if (type == 0)
re = new
Regex("((?<field>[^\",\\r\\n]+)|\"(?<field>([^\"]|\"\")+)\")(,|(?<rowbreak>\\r\\n|\\n|$))");
else if (type == 1)
re = new
Regex("((?<field>[^\"\\t\\r\\n]+)|\"(?<field>([^\"]|\"\")+)\")(\\t|(?<rowbreak>\\r\\n|\\n|\n|$))");
else
re = new Regex("");
ArrayList colArray = new ArrayList();
ArrayList rowArray = new ArrayList();
int colCount = 0;
int maxColCount = 0;
string rowbreak = "";
string field = "";
MatchCollection mc = re.Matches(inputString);
foreach (Match m in mc)
{
// retrieve the field and replace two double-quotes
with a single double-quote
field = m.Result("${field}").Replace("\"\"", "\"");
rowbreak = m.Result("${rowbreak}");
if (field.Length > 0)
{
colArray.Add(field);
colCount++;
}
if (rowbreak.Length > 0)
{
// add the column array to the row Array List
rowArray.Add(colArray.ToArray());
// create a new Array List to hold the field values
colArray = new ArrayList();
if (colCount > maxColCount)
maxColCount = colCount;
colCount = 0;
}
}
if (rowbreak.Length == 0)
{
// this is executed when the last line doesn't
// end with a line break
rowArray.Add(colArray.ToArray());
if (colCount > maxColCount)
maxColCount = colCount;
}
// create the columns for the table
for (int i = 0; i < maxColCount; i++)
dt.Columns.Add(String.Format("col{0:000}", i));
// convert the row Array List into an Array object for easier access
Array ra = rowArray.ToArray();
for (int i = 0; i < ra.Length; i++)
{
// create a new DataRow
DataRow dr = dt.NewRow();
// convert the column Array List into an Array object
for easier access
Array ca = (Array)(ra.GetValue(i));
// add each field into the new DataRow
for (int j = 0; j < ca.Length; j++)
dr[j] = ca.GetValue(j);
// add the new DataRow to the DataTable
dt.Rows.Add(dr);
}
// in case no data was parsed, create a single column
if (dt.Columns.Count == 0)
dt.Columns.Add("NoData");
//Calculate how many line-breaks there are (and thus how many records there should be) vs. how many rows there actually are..
string exp = "\n";
MessageBox.Show("Num Correct:" +
Regex.Matches(inputString, exp).Count + " Num Realized: " +
dt.Rows.Count);
return dt;
}
Tags for this Thread
Posting Permissions
- You may not post new threads
- You may not post replies
- You may not post attachments
- You may not edit your posts
-
Forum Rules
|
Click Here to Expand Forum to Full Width
|