Tuesday, June 20, 2017

convert csv to xml

Hello,

Since i have nothing to do in my free time, i answered a post in expert exchange to convert a CSV to XML

Here is my code, it's a VBS script

It does not support everything the CSV can contain

The input CSV was sometimes with double quotes, sometimes not
Sometimes there was carriage return in a field, sometimes 2

So i support: double quotes in CSV
and up to two carriage return in a field
You can change it easily to support more carriage returns in a field

The files must all be in same folder

The log (file starting with ZZZ in same folder) will tell you if something went wrong


'=== CSV to xml
'=== by sergefournier@hotmail.com 2017-06-20

'=== save thsi script as: csv2xml.VBS
'=== input file must be sample1.csv in the same directory as the script (same folder)
'=== outputfile will be sampleout1.xml
'=== zzz_troubleshooting.txt will tell you what was not processed (lines)

'=== support 2 carriage return in a field, but not 3 :P

Set objFSO    = CreateObject("Scripting.FileSystemObject")

'=== actual drive, actual directory, and "\"
thepath=WScript.ScriptFullName
p = instrRev(thepath,"\")
basedir  = left(thepath,p)
filnam = right(thepath,len(thepath)-p)

logall = 1

if logall=1 then
   '=== debug log
   file02 = basedir & "zzz_troubleshooting.txt"
   on error resume next
   Set Fil02 = objFSo.OpenTextFile(file02, 2, true)
   on error goto 0
end if

if logall=1 then
   fil02.WriteLine date & " " & time & " START"
end if


'=== input (txt file, not using ODBC txt driver)
filename01 = basedir & "sample1.csv"
Set File01 = objFSo.OpenTextFile(filename01, 1, true)


'=== read filed names (1st line)


if file01.AtEndOfStream <> true then
line01 = file01.readline

'=== array with all field names
arr01 = split(line01,",")
for i = 0 to ubound(arr01)
arr01(i) = trim(arr01(i)) '=== remove spaces
arr01(i) = replace(arr01(i), " ", "")
arr01(i) = replace(arr01(i), "/", "")
fil02.WriteLine date & " " & time & " fieldname: " & arr01(i)
next

'=== spaces in xml name are not good


'=== output xml
'=== create xml
Set xmlDoc = CreateObject("Microsoft.XMLDOM")
Set objRoot = xmlDoc.createElement("data")
xmlDoc.appendChild objRoot

Set objIntro = xmlDoc.createProcessingInstruction("xml","version='1.0'")
xmlDoc.insertBefore objIntro,xmlDoc.childNodes(0)

xmlDoc.Save basedir & "sampleout1.xml"


'=== add a record
Set xmlDoc = CreateObject("Microsoft.XMLDOM")

xmlDoc.Async = "False"
xmlDoc.Load(basedir & "sampleout1.xml")

Set objRoot = xmlDoc.documentElement
linecount01 = 0

Do While file01.AtEndOfStream <> True
Set objRecord = xmlDoc.createElement("client")
objRoot.appendChild objRecord

line01 = file01.readline
'=== split value in CSV
arr02 = split(line01,",")

x = 0 '=== number of elements in new array to merge elements with double quotes
redim arr03(x)

'2017-06-20 21:43:50 DATA before: 10400,Pep,Pepmiller,Pep.Pepmiller@RalphLauren.com,"2800 Routh Street, Suite 260",Dallas,TX,75201
'2017-06-20 21:43:50 DATA after : ,10400,Pep,Pepmiller,Pep.Pepmiller@RalphLauren.com,2800 Routh Street"2800 Routh Street Suite 260,TX,75201
if ubound(arr02) <> 0 then
if ubound(arr01) <> ubound(arr02) then

'=== do we have a double quote and a carriage return before we meet the closing double quote
doublequoteanomaly01 = 1
for i = 0 to ubound(arr02)
'=== search for a closing double quote, in case a carriage return separate the line
if right(arr02(i),1) = """" then
'=== found a closing double quote, no anomaly
doublequoteanomaly01 = 0
end if
next

if doublequoteanomaly01 = 1 then
fil02.WriteLine date & " " & time & " --- found double quote alone, will add next line: " & linecount01
'=== we have only one double quote in the line, wich mean there is a carriage return after the double quote
'=== we have to read the next line and merge both lines
if file01.AtEndOfStream <> true then
line03 = file01.readline
linecount01 = linecount01 + 1
'=== data was on two line, we add a space instead of a carriage return
'=== xml will support a carriage return special code
line01 = line01 & " " & line03

if instr(line03, """") = 0 then
fil02.WriteLine date & " " & time & " --- final double quote not in line, reading another line: " & linecount01
if file01.AtEndOfStream <> true then
line03 = file01.readline
linecount01 = linecount01 + 1
line01 = line01 & " " & line03
end if
end if

arr02 = split(line01,",")
end if
end if



if ubound(arr01) <> ubound(arr02) then
'=== a data might have quote to support a comma inside of it
'fil02.WriteLine date & " " & time & " DATA before: " & line01

line02 = ""
for i=0 to ubound(arr02)
if i < ubound(arr02) then
line02 = line02 & arr02(i) & ","
else
line02 = line02 & arr02(i)
end if
next

'fil02.WriteLine date & " " & time & " DATA befor2: " & line02

skip01 = 0
for i = 0 to ubound(arr02)

newdata01 = arr02(i)


if left(arr02(i),1) = """" then
'=== double quote found
newdata01 = right(newdata01,len(Arr02(i))-1) & ","
i3 = i + 1
for i2 = i3 to ubound(arr02)
if right(arr02(i2),1) = """" then
newdata01 = newdata01 & left(arr02(i2),len(arr02(i2))-1)
i = i + 1
'=== end double quote found, we exit the for i2
exit for
else
newdata01 = newdata01 & arr02(i2)
i = i + 1
end if
next
end if


redim preserve arr03(x)
arr03(x) = newdata01
x = x + 1

next
arr02 = arr03

line02 = ""
for i=0 to ubound(arr03)
if i < ubound(arr03) then
line02 = line02 & arr03(i) & ","
else
line02 = line02 & arr03(i)
end if
next
'fil02.WriteLine date & " " & time & " DATA after : " & line02
else
'=== reading second line was enough for element to fit in numbers (data number = fields numbers)
'=== there was no comma, justye a carriage return
'=== just remove double quotes in the splitted elements
for i=0 to ubound(arr02)
arr02(i) = replace(arr02(i),"""","")
next

end if

end if

if ubound(arr01) = ubound(arr02) then

for i = 0 to ubound(arr01)

Set objFieldValue = xmlDoc.createElement(arr01(i))
'objfieldvalue.SetAttribute "displaynamefra", "Numéro"
objFieldValue.Text = arr02(i)
objRecord.appendChild objFieldValue
next
else
fil02.WriteLine date & " " & time & " ERROR line: " & linecount01
fil02.WriteLine date & " " & time & " the number of columns (fields) does not correspond to the number of data"
fil02.WriteLine date & " " & time & " ubound arr01: " & ubound(arr01)
fil02.WriteLine date & " " & time & " ubound arr02: " & ubound(arr02)
fil02.WriteLine date & " " & time & " data: " & line01
end if
else
fil02.WriteLine date & " " & time & " ERROR line empty: " & line02
end if
linecount01 = linecount01 + 1

loop

xmlDoc.Save basedir & "sampleout1.xml"
else
msg01 = "ERROR input file is empty"
msgbox(msg01)
end if


file01.close

2 comments: