Optical Character Recognition (OCR) Script

I wanted to learn more about using the Vision framework to perform OCR on images and decided to create this thread with some of my test scripts. As a starting point, I was fortunate to find a script written by Stephen Kaplan on Stack Overflow, which can be found here.

The following script gets text from a single image file:

use framework "Foundation"
use framework "Vision"
use scripting additions

set theFile to POSIX path of (choose file of type {"public.image"})
set theText to getText(theFile)

on getText(theFile)
	set theFile to current application's |NSURL|'s fileURLWithPath:theFile
	set requestHandler to current application's VNImageRequestHandler's alloc()'s initWithURL:theFile options:(missing value)
	set theRequest to current application's VNRecognizeTextRequest's alloc()'s init()
	requestHandler's performRequests:(current application's NSArray's arrayWithObject:(theRequest)) |error|:(missing value)
	set theResults to theRequest's results()
	set theArray to current application's NSMutableArray's new()
	repeat with aResult in theResults
		(theArray's addObject:(((aResult's topCandidates:1)'s objectAtIndex:0)'s |string|()))
	end repeat
	return (theArray's componentsJoinedByString:linefeed) as text
end getText

This script returns text from an image on the clipboard.

use framework "AppKit" -- required for NSPasteboard
use framework "Foundation"
use framework "Vision"
use scripting additions

try
	set theText to getText()
on error
	display alert "An error has occurred" message "This often happens when an image is not found on the clipboard"
end try

on getText()
	set thePasteboard to current application's NSPasteboard's generalPasteboard()
	set imageData to thePasteboard's dataForType:(current application's NSPasteboardTypeTIFF)
	set requestHandler to current application's VNImageRequestHandler's alloc()'s initWithData:imageData options:(missing value)
	set theRequest to current application's VNRecognizeTextRequest's alloc()'s init()
	requestHandler's performRequests:(current application's NSArray's arrayWithObject:(theRequest)) |error|:(missing value)
	set theResults to theRequest's results()
	set theArray to current application's NSMutableArray's new()
	repeat with aResult in theResults
		(theArray's addObject:(((aResult's topCandidates:1)'s objectAtIndex:0)'s |string|()))
	end repeat
	return (theArray's componentsJoinedByString:linefeed) as text
end getText
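
If you want the recognized text to replace the image on the clipboard, one option is a plain scripting-additions call placed right after the getText() call inside the try block, something like:

set the clipboard to theText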

Here’s a version in JavaScript that works with images and PDFs (not all of which contain text layers). Please see the comments for details. Note that this script does not sort the recognized text by reading direction, so some ordering errors may occur (unless Apple has since fixed the framework so that this is no longer a problem).

#!/usr/bin/osascript -l JavaScript 
ObjC.import('Vision');  /* OCR */
ObjC.import('PDFKit'); /* PDF */
ObjC.import('AppKit'); /* NSImage */
ObjC.import('CoreGraphics'); /* CGImage */

/* Function to run OCR on a single image of class "CGImage" */
function textFromImage(image) {   
  const error = $(); /* So Objective-C gets a "reference" */
  /* Specify the languages you want to recognize */
  const languageCodes = ["de-DE", "en-US"]; 
  const recognizedLanguages = 
    languageCodes.map( lang => $(lang)); /* Create Objective-C strings from JS ones */
  
  /* Create the Vision request and configure it */
  const request = $.VNRecognizeTextRequest.alloc.init;
  request.setRecognitionLanguages($(recognizedLanguages));
  request.usesLanguageCorrection = true;
  
  /* Build request array for the handler */
  const reqArray = $([request]);
  
  /* Create the request handler */
  const imageRequestHandler = 
    $.VNImageRequestHandler.alloc.initWithCGImageOptions(image, {});
  
  /* Start OCR  by calling "performRequestsError" on the handler */
  const success = imageRequestHandler.performRequestsError(reqArray, error);
  if (!success) {
    console.log($(error.localizedDescription).js)
    return [];
  } else {
    /* Write the text fragments into the array "result" */
    const resultArray = request.results.js; /* convert the NSArray to a JS Array with ".js" */
    const result = resultArray.map(r => {
      const topCandidate = r.topCandidates(1).js[0];
      return topCandidate.string.js;
    });
    return result;
  }
}

/* Function to OCR a file, expects a path in POSIX format */
function OCRFile(filename) {
  const error = $();
  const uti = $();
  const nil = $();
  /*
  Create the URL and determine UTI for the file
  */
  const fileURL = $.NSURL.fileURLWithPath(filename);
  fileURL.getResourceValueForKeyError(uti, $.NSURLTypeIdentifierKey, error);

  let result = [];
  if (/jpeg|image|png/.test(uti.js)) {
    /* Images with one "page":
       Load the image from the file and send it to the OCR function "textFromImage"
    */
    const NSimage = $.NSImage.alloc.initWithContentsOfURL(fileURL);
    const image = NSimage.CGImageForProposedRectContextHints(nil, nil, nil);
    result.push(textFromImage(image));
  } else if (/pdf/.test(uti.js)) {
    /* PDF file with possibly more than one page:
       Create a PDFDocument and, for each page,
         render its data into an NSImage, get a CGImage from that,
         and pass it on to the OCR function "textFromImage"
    */
    const pdfDoc = $.PDFDocument.alloc.initWithURL(fileURL);
    for (let i = 0; i < pdfDoc.pageCount; i++) {
      const rawData = pdfDoc.pageAtIndex(i).dataRepresentation;
      const image = $.NSImage.alloc.initWithData(rawData);
      /* "textFromImage" expects a CGImage, not TIFF data */
      const cgImage = image.CGImageForProposedRectContextHints(nil, nil, nil);
      result.push(textFromImage(cgImage));
    }
  }
  return result;
}

/* Main function: Runs OCRFile on a fixed file 
   Output is written to stderr when the script is run in Script Editor or
   via osascript. Can easily be saved to the clipboard or written to an external file */
(() => {
  const result = OCRFile('POSIX PATH TO FILE');
  console.log(`${result.length} Pages `);
  result.forEach((page,index) => {
    console.log(`Page ${index + 1} -->`)
    page.forEach((paragraph, j) => console.log(`    ${j}: ${paragraph}`));
  })
})()

The script contained below places the cursor in mouse-select mode, which allows the user to select an area of an image containing text. The user can toggle mouse-select mode to window-select mode by pressing the space bar. The script then converts text within the selection or window to a string.

Because the script uses the screencapture utility, the user can select almost any area of the screen and the script will work, although there’s probably no reason to do this.

use framework "Foundation"
use framework "Vision"
use scripting additions

set theText to getImage()

on getImage()
	set tempFolder to current application's NSTemporaryDirectory()
	set tempFile to tempFolder's stringByAppendingPathComponent:"OCR Temp File.png"
	set fileManager to current application's NSFileManager's defaultManager()
	do shell script "screencapture -ioxa " & quoted form of (tempFile as text)
	try
		set theText to getText(tempFile)
		(fileManager's removeItemAtPath:tempFile |error|:(missing value))
		return theText
	on error
		display alert "An error has occurred" message "No text found in selection or operation cancelled"
		(fileManager's removeItemAtPath:tempFile |error|:(missing value))
	end try
end getImage

on getText(theFile)
	set theFile to current application's |NSURL|'s fileURLWithPath:theFile
	set requestHandler to current application's VNImageRequestHandler's alloc()'s initWithURL:theFile options:(missing value)
	set theRequest to current application's VNRecognizeTextRequest's alloc()'s init()
	requestHandler's performRequests:(current application's NSArray's arrayWithObject:(theRequest)) |error|:(missing value)
	set theResults to theRequest's results()
	set theArray to current application's NSMutableArray's new()
	repeat with aResult in theResults
		(theArray's addObject:(((aResult's topCandidates:1)'s objectAtIndex:0)'s |string|()))
	end repeat
	return (theArray's componentsJoinedByString:linefeed) as text
end getText

There are numerous options that can be added to the above scripts, and they should be placed after the following existing line:

set theRequest to current application's VNRecognizeTextRequest's alloc()'s init()

One option is the text recognition level, which can be fast or accurate. The default is accurate, and this can be changed by adding the following line:

theRequest's setRecognitionLevel:(current application's VNRequestTextRecognitionLevelFast)

The documentation describes the recognition levels as follows:

Fast. The fast path uses the framework’s character-detection capabilities to find individual characters, and then uses a small machine learning model to recognize individual characters and words. This approach is similar to traditional optical character recognition (OCR).

Accurate. The accurate path uses a neural network to find text in terms of strings and lines, and then performs further analysis to find individual words and sentences. This approach is much more in line with how humans read text.

A second option is the recognition languages, which can be set by adding the following line. The default is English, and more than one language can be specified, in which case the languages are used in the order they are listed. The languages are given as ISO language codes.

theRequest's setRecognitionLanguages:{"en", "fr"}

A third option is language correction, which can be changed with the following line:

theRequest's setUsesLanguageCorrection:false

This is described in the documentation as:

When this value is YES, Vision applies language correction during the recognition process. Disabling this property returns the raw recognition results, which provides performance benefits but less accurate results.

The Apple documentation for the VNRecognizeTextRequest class can be found here. It can be a bit dense but is helpful just to see some of the other available options.

I did some anecdotal testing to see which of the above options would return the best results. I speak only English and did not test the recognition-language setting, so the language remained the default English.

The first image was a screenshot of four paragraphs of an online news article with smallish print. Accurate text recognition (both with and without language correction) returned the text with no errors. Fast text recognition (both with and without language correction) accurately returned text but got punctuation wrong in a few spots.

The second image was a screenshot of an AppleScript in TextEdit, and none of the options did a really good job. Accurate text recognition with language correction disabled did the best, with only a few mistakes. Accurate text recognition with language correction enabled did not do well; it inserted spaces and deleted punctuation marks in an apparent attempt to make the script into a readable document. Fast text recognition (both with and without language correction) did an abysmal job in every respect.

The third image was a screenshot of the above text viewed in a text editor with the text size set to large. Accurate text recognition (both with and without language correction) made no mistakes, and fast text recognition (both with and without language correction) made just a few miscellaneous mistakes.

So, for best results overall, I will use accurate text recognition with language correction disabled.
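
Here is a minimal sketch of a request configured that way; it consists of the existing init() line followed by the two option lines discussed above, and would replace the plain init() line in any of the scripts in this thread:

set theRequest to current application's VNRecognizeTextRequest's alloc()'s init()
theRequest's setRecognitionLevel:(current application's VNRequestTextRecognitionLevelAccurate)
theRequest's setUsesLanguageCorrection:false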

Peavine, I have enjoyed reading your investigations into OCR using the Vision framework and have learned so much about using this framework via AppleScript. Thanks for sharing your ideas.

akim. You’re most welcome. This has been a fun project for me.

Thanks for the link. That’s a cool project!

Fredrik71. Thanks for the link.

The following is my first attempt to get text from a PDF with OCR. As presently written, it works with one page only and operates by converting the PDF page into image data, which is then converted to text with the Vision framework.

This works with a PDF that contains easily recognized text but returns poor results with anything else. I’ll work on that.

use framework "AppKit" -- for NSImage
use framework "Foundation"
use framework "Quartz"
use framework "Vision"
use scripting additions

on main()
	set thePage to 1
	set theFile to POSIX path of (choose file of type {"com.adobe.pdf"})
	set imageData to getImageData(theFile, thePage)
	set theText to getText(imageData)
end main

on getImageData(theFile, thePage)
	set theFile to current application's |NSURL|'s fileURLWithPath:(theFile)
	set theDocument to current application's PDFDocument's alloc()'s initWithURL:theFile
	set thePage to (theDocument's pageAtIndex:(thePage - 1))
	set theData to (current application's NSImage's alloc()'s initWithData:(thePage's dataRepresentation()))
	return theData's TIFFRepresentation()
end getImageData

on getText(imageData)
	set requestHandler to current application's VNImageRequestHandler's alloc()'s initWithData:imageData options:(missing value)
	set theRequest to current application's VNRecognizeTextRequest's alloc()'s init()
	theRequest's setUsesLanguageCorrection:false
	requestHandler's performRequests:(current application's NSArray's arrayWithObject:(theRequest)) |error|:(missing value)
	set theResults to theRequest's results()
	set theArray to current application's NSMutableArray's new()
	repeat with anObservation in theResults
		set theResult to ((anObservation's topCandidates:1)'s objectAtIndex:0)'s |string|()
		(theArray's addObject:theResult)
	end repeat
	return (theArray's componentsJoinedByString:linefeed) as text
end getText

main()

Hi Peavine
This script works great in my workflow. Just wondering whether it can OCR a table (PDF or image) and output the result as CSV?
Cheers

I’d suggest giving it a try. It might happen that the OCR works by columns, though. And the process will probably not be very robust, depending on the content of the table cells.

In which order: first cell in first row, second cell in first row … or first cell in first row, first cell in second row?
I’m asking because in many PDFs I receive, it’s the second one.

Edit: Not really important, though. One could use the rectangles associated with every text blob to sort the text in whichever order you like.

@One208. My PDF script (and my script in post 4) can be modified to return CSV text from a table in a PDF by deleting line 1 below from my script and replacing it with line 2:

-- PDF Script
return (theArray's componentsJoinedByString:linefeed) as text
return (theArray's componentsJoinedByString:",") as text

-- Script in Post 4
return (theArray's componentsJoinedByString:linefeed)
return (theArray's componentsJoinedByString:",")

I tested this with a Numbers spreadsheet that I saved as a PDF, and the order of the returned text was cells A1 to A10, B1 to B10, and so on. This would not seem to be very useful, but I don’t know how to change that.

Right now I’m working to get more accurate results with my PDF OCR script (which is very substandard). Afterwards, I’ll look at the manner and order in which text is returned from an image or PDF.

By looking at the bounding box of each text fragment and sorting them by y, then x coordinates. It’s feasible, but it might be a bit convoluted in AppleScript.
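
For anyone who wants to experiment with that, here is a minimal sketch, assuming theResults holds the observations produced by a VNRecognizeTextRequest as in the scripts above. The bounding boxes are normalized with a bottom-left origin, so top-to-bottom reading order means descending y; fragments on the same visual line rarely share an identical y value, so a robust version would first group rows within a tolerance:

on sortedText(theResults)
	set theArray to current application's NSMutableArray's new()
	repeat with aResult in theResults
		set theBox to aResult's boundingBox() -- normalized rect with a bottom-left origin
		set theText to (((aResult's topCandidates:1)'s objectAtIndex:0)'s |string|())
		(theArray's addObject:{x:(x of origin of theBox), y:(y of origin of theBox), |text|:theText})
	end repeat
	set descY to current application's NSSortDescriptor's sortDescriptorWithKey:"y" ascending:false
	set descX to current application's NSSortDescriptor's sortDescriptorWithKey:"x" ascending:true
	set sortedArray to theArray's sortedArrayUsingDescriptors:{descY, descX}
	return ((sortedArray's valueForKey:"text")'s componentsJoinedByString:linefeed) as text
end sortedText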

The following is my revised script that performs OCR on one page of a PDF. This generally works well, but there were instances in my testing where the script found no text. One workaround is to use the screencapture script in post 4; another is to save the PDF as a PNG image and to use the script in post 1 above.

use framework "AppKit"
use framework "Foundation"
use framework "Quartz"
use framework "Vision"
use scripting additions

set pageNumber to 1 -- set as desired
set imageResolution to 300 -- test different values; higher PPI generally improves recognition at the cost of speed
set theFile to POSIX path of (choose file of type {"com.adobe.pdf"})
set imageData to getImageData(theFile, pageNumber, imageResolution)
set theText to getText(imageData)

on getImageData(theFile, pageNumber, thePPI) -- based on a handler by Shane Stanley
	set theFile to current application's |NSURL|'s fileURLWithPath:theFile
	set theDocument to current application's PDFDocument's alloc()'s initWithURL:theFile
	set thePage to (theDocument's pageAtIndex:(pageNumber - 1))
	set pageSize to (thePage's boundsForBox:(current application's kPDFDisplayBoxMediaBox))
	set pageWidth to current application's NSWidth(pageSize)
	set pageHeight to current application's NSHeight(pageSize)
	set pixelWidth to (pageWidth * thePPI / 72) div 1 -- PDF page sizes are in points, 72 per inch
	set pixelHeight to (pageHeight * thePPI / 72) div 1
	set pdfImageRep to (current application's NSPDFImageRep's imageRepWithData:(thePage's dataRepresentation()))
	set newRep to (current application's NSBitmapImageRep's alloc()'s initWithBitmapDataPlanes:(missing value) pixelsWide:pixelWidth pixelsHigh:pixelHeight bitsPerSample:8 samplesPerPixel:4 hasAlpha:true isPlanar:false colorSpaceName:(current application's NSDeviceRGBColorSpace) bytesPerRow:0 bitsPerPixel:32)
	current application's NSGraphicsContext's saveGraphicsState()
	current application's NSGraphicsContext's setCurrentContext:(current application's NSGraphicsContext's graphicsContextWithBitmapImageRep:newRep)
	pdfImageRep's drawInRect:{origin:{x:0, y:0}, |size|:{width:pixelWidth, height:pixelHeight}} fromRect:(current application's NSZeroRect) operation:(current application's NSCompositeSourceOver) fraction:1.0 respectFlipped:false hints:(missing value)
	current application's NSGraphicsContext's restoreGraphicsState()
	return newRep's TIFFRepresentation()
end getImageData

on getText(imageData)
	set requestHandler to current application's VNImageRequestHandler's alloc()'s initWithData:imageData options:(missing value)
	set theRequest to current application's VNRecognizeTextRequest's alloc()'s init()
	requestHandler's performRequests:(current application's NSArray's arrayWithObject:(theRequest)) |error|:(missing value)
	set theResults to theRequest's results()
	set theArray to current application's NSMutableArray's new()
	repeat with aResult in theResults
		(theArray's addObject:(((aResult's topCandidates:1)'s objectAtIndex:0)'s |string|()))
	end repeat
	return (theArray's componentsJoinedByString:linefeed) as text
end getText

Thanks for responding; I look forward to the update. Have a good day.

I spent some time on this and couldn’t get it to work. A sticking point for me was getting the coordinates of each corner of each bounding box. However, even if I had accomplished this, I think the approach I envisioned would be so long, involved, and unreliable as not to be worth the effort.

I’d use only the lower left corner, not all of them. Assuming, of course, that there’s no skew in the document, or only a small one.
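
For reference, a minimal sketch of reading that corner, assuming theResults holds VNRecognizedTextObservation objects as in the scripts above (the corner properties are inherited from VNRectangleObservation and are normalized points with a bottom-left origin):

repeat with aResult in theResults
	set thePoint to aResult's bottomLeft() -- topLeft(), topRight() and bottomRight() also exist
	log {x of thePoint, y of thePoint}
end repeat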
