Using ImageMagick and Tesseract to sort TIFFs on Windows
Recently I wrote a post about my search for a TIFF iFilter that would enable me to use VBScript to query a Windows Indexing Services server for file management.
I found that since OCR is never always 100% accurate, neither were my attempts at sorting all the inbound EMR faxes we get each day.
I did however, find Tesseract, a great product that was originally developed by HP and proprietary, and is now developed by Google and licensed under the Apache License v2, open source.
It is one of the most accurate open source OCR engines available. It is quite basic, and in the version you obtain from the project page, it only operates from the command line, and without the libtiff library, will only do it's work on un-compressed TIFFs. More information can be found on the project pages, and Wikipedia. Doing some scouring, I aso found a front-end, and ArchivistaBox, a complete document management system.
Image via Wikipedia
I'm using it in Windows, so I needed to do one of two things: recompile the sources in Visual Studio Express to include the libtiff libraries, or use the convert program that is bundled with ImageMagick. I tried to recompile but ran into too many issues, so fell back to ImageMagick.Following is the script I kick off daily to sort these:
Option Explicit Dim oShell Dim sCwhhm Dim sAqscan Dim sConv Dim sOpts Dim sTess Dim FSO Dim sFolder Dim sFile Dim sFileColl Dim inFile Dim sClinic Dim sDOS Dim sMoveTo Dim sNumBse, sNumMdn, sNumUnk Dim sStartTime, sFinishTime, sMail, sLog Dim i, x, l, arrFileLines() Set oShell = CreateObject("WScript.Shell") Set FSO = CreateObject("Scripting.FileSystemObject") Set sLog = FSO.CreateTextFile("C:\scripts\Log\CWHHM_FAXMOVELOG.TXT", True) sStartTime = Now() sLog.WriteLine(sStartTime & "---------------------------") ' /////////////////////////////////////////// ' Set the source and destination ' /////////////////////////////////////////// sCwhhm = "F:\Incomi~1\CWHHM\" sAqscan = "F:\Scanne~1\Aquarius\14CWHHM\" ' /////////////////////////////////////////// ' Set the convert program and flags sConv = "C:\Progra~1\ImageM~1.3-Q\convert.exe" sOpts = "-density 150x150 -compress none" ' /////////////////////////////////////////// ' Set the tesseract program sTess = "C:\Progra~1\tesser~1.00\tesser~1.exe" ' /////////////////////////////////////////// ' First clean up the directory so we can ' build a list of all tifs ' /////////////////////////////////////////// If (FSO.FileExists(sCwhhm & "Thumbs.db")) Then FSO.DeleteFile(sCwhhm & "Thumbs.db") End If If (FSO.FileExists(sCwhhm & "tesseract.log")) Then FSO.DeleteFile(sCwhhm & "tesseract.log") End If ' /////////////////////////////////////////// ' Set the folder and file collection ' /////////////////////////////////////////// Set sFolder = FSO.GetFolder(sCwhhm) Set sFileColl = sFolder.Files ' /////////////////////////////////////////// ' Loop over remaining files ' /////////////////////////////////////////// For Each sFile in sFileColl ' /////////////////////////////////////////// ' Convert the file to an uncompressed format oShell.Run sConv & " " & sCwhhm & sFile.Name & " " & sOpts & " " & sCwhhm & "out.tif", 0, true ' /////////////////////////////////////////// ' Now OCR it, then delete the old tif oShell.Run sTess & " " & sCwhhm & "out.tif " & sCwhhm & "ocr", 0, true FSO.DeleteFile(sCwhhm & "out.tif") ' /////////////////////////////////////////// ' Read the file in to our array Set inFile = FSO.OpenTextFile(sCwhhm & "ocr.txt", 1) i = 0 Redim arrFileLines(i) Do Until inFile.AtEndOfStream Redim Preserve arrFileLines(i) arrFileLines(i) = inFile.ReadLine i = i + 1 Loop ' close and delete the file, we don't need it anymore inFile.Close FSO.DeleteFile(sCwhhm & "ocr.txt") ' /////////////////////////////////////////// ' Now read the contents of the file, from the top down, For l = 0 to UBound(arrFileLines) ' /////////////////////////////////////////// ' Check for the header If UCase(Left(arrFileLines(l), 6)) = "CENTER" Then ' /////////////////////////////////////////// ' Test for the progress line on the next line, 3 If UCase(Left(arrFileLines(l + 1), 8)) = "PROGRESS" Then ' /////////////////////////////////////////// ' We are pretty sure the next line, 4, is a clinic name sClinic = UCase(arrFileLines(l + 2)) Select Case sClinic Case "MERIDIAN CLINIC" sClinic = "MDN\" sNumMdn = sNumMdn + 1 Case "BOISE CLINIC" sClinic = "BSE\" sNumBse = sNumBse + 1 Case Else sClinic = "UNK\" sNumUnk = sNumUnk + 1 End Select ' /////////////////////////////////////////// ' Get the DOS, which is supposed to be on line ' 8 of a progress report, but we'll check a couple ' to be sure. We are on line 2, CENTER. For x = 4 to 8 sDOS = UCase(Left(arrFileLines(l + x), 4)) If sDOS = "DATE" Then sDOS = Trim(Replace(Mid(arrFileLines(l + x), 17, 11), "/", "")) sMoveTo = sAqscan & sClinic & sDOS & "\" If FSO.FolderExists(sMoveTo) Then FSO.MoveFile sCwhhm & sFile.Name, sMoveTo Else On Error Resume Next FSO.CreateFolder(sMoveTo) If Err.Number <> 0 Then sLog.WriteLine("ERROR: " & Err.Number & " - " & Err.Description) sLog.WriteLine("ERROR: " & sFile.Name & " was going to be moved to " & sMoveTo) Set sMail = FSO.CreateTextFile("C:\Temp\TEMPMAIL.TXT", True) sMail.WriteLine("TO: " & Chr(34) & "JOHN CROSON" & Chr(34) & "") sMail.WriteLine("FROM: " & Chr(34) & "IHBS Administrator" & Chr(34) & " ") sMail.WriteLine("SUBJECT: CWHHM EMR File Sorting") sMail.WriteBlankLines(1) sMail.WriteLine("There has been an error. Check c:\scripts\log\CWHHM_FAXMOVELOG.TXT") sMail.Close FSO.MoveFile "C:\Temp\TEMPMAIL.TXT", "\\ibsts\c$\Inetpub\mailroot\Pickup\" End If FSO.MoveFile sCwhhm & sFile.Name, sMoveTo End If Exit For End If Next Else ' ////////////////////////////////////////// ' This must be another file type, step to the next line sClinic = UCase(Left(arrFileLines(l + 1), 5)) Select Case sClinic Case "MERID" sClinic = "MDN\" sNumMdn = sNumMdn + 1 Case "BOISE" sClinic = "BSE\" sNumBse = sNumBse + 1 Case Else sClinic = "UNK\" sNumUnk = sNumUnk + 1 End Select ' /////////////////////////////////////////// ' Get the DOS, which could be on line ' 7 through 12. For x = 5 To 15 sDOS = UCase(Left(arrFileLines(l + x), 4)) If sDOS = "DATE" Then sDOS = Trim(Replace(Mid(arrFileLines(l + x), 17, 11), "/", "")) sMoveTo = sAqscan & sClinic & sDOS & "\" If FSO.FolderExists(sMoveTo) Then FSO.MoveFile sCwhhm & sFile.Name, sMoveTo Else On Error Resume Next FSO.CreateFolder(sMoveTo) If Err.Number <> 0 Then sLog.WriteLine("ERROR: " & Err.Number & " - " & Err.Description) sLog.WriteLine("ERROR: " & sFile.Name & " was going to be moved to " & sMoveTo) Set sMail = FSO.CreateTextFile("C:\Temp\TEMPMAIL.TXT", True) sMail.WriteLine("TO: " & Chr(34) & "JOHN CROSON" & Chr(34) & " ") sMail.WriteLine("FROM: " & Chr(34) & "IHBS Administrator" & Chr(34) & " ") sMail.WriteLine("SUBJECT: CWHHM EMR File Sorting") sMail.WriteBlankLines(1) sMail.WriteLine("There has been an error. Check c:\scripts\log\CWHHM_FAXMOVELOG.TXT") sMail.Close FSO.MoveFile "C:\Temp\TEMPMAIL.TXT", "\\ibsts\c$\Inetpub\mailroot\Pickup\" End If FSO.MoveFile sCwhhm & sFile.Name, sMoveTo End If Exit For End If Next sClinic = "" sDOS = "" End If '<- Progress Test Exit For End If '<- Header Test Next Next sFinishTime = Now() sLog.WriteLine(sFinishTime & "---------------------------") sLog.Close ' ////////////////////////////// ' Now send an email notification that the ' files are ready. Set sMail = FSO.CreateTextFile("C:\Temp\TEMPMAIL.TXT", True) sMail.WriteLine("TO: " & Chr(34) & "JOHN CROSON" & Chr(34) & " ") sMail.WriteLine("FROM: " & Chr(34) & "IHBS Administrator" & Chr(34) & " ") sMail.WriteLine("SUBJECT: CWHHM EMR File Sorting") sMail.WriteBlankLines(1) sMail.WriteLine("CWHHM Electronic Medical Record sorting is finished.") sMail.WriteBlankLines(1) sMail.WriteLine("Start time was " & sStartTime & ". Finish time was " & sFinishTime & ".") sMail.WriteBlankLines(1) sMail.WriteLine("The number of Boise files was " & sNumBse & ".") sMail.WriteBlankLines(1) sMail.WriteLine("The number of Meridian files was " & sNumMdn & ".") sMail.WriteBlankLines(1) If sNumUnk > 0 Then sMail.WriteLine("The number of Unkown files was " & sNumUnk & ".") End If sMail.Close FSO.MoveFile "C:\Temp\TEMPMAIL.TXT", "\\ibsts\c$\Inetpub\mailroot\Pickup\"
Comments
What a great script!
I've been just getting into vbscript and I'm sure I can learn a lot just by dissecting pieces or your script.
Interesting method for sending mail.
Awesome work.
-Kyle
I do love to tinker...