Use case-change codes in replacement templates with ASObjC regex

This is a tidy-up of a handler I wrote six years ago for a topic in the LNS fora and subsequently forgot until I was reminded of it yesterday by a similar topic here on MacScripter. It allows case-change codes (similar, I believe, to those in a regex implementation offered by Just Great Software) to be used in replacement templates with ASObjC’s regex, which is the ICU flavour and doesn’t support them itself. The four recognised codes are the upper-case renditions of:

U - upper-case an entire capture group
L - lower-case an entire capture group
I - title-case an entire capture group
F - upper-case the first character of a capture group and lower-case the others.

(I don’t know why “I” and “F”, but apparently that’s what they are in JGS regex!) If used, one of these letters should be inserted between the “$” metacharacter and the relevant capture group number in the replacement template (e.g. “$U1”). Otherwise, search-and-replace is handled in (hopefully) the normal ICU fashion.

use AppleScript version "2.4" -- Yosemite (10.10) or later
use framework "Foundation"
-- use scripting additions

(*
	regexReplace(txt, regexPattern, replacementTemplate)
	
	Regex search-and-replace using NSRegularExpression (ICU), with the addition that a back-reference
	in the replacement template may carry a case-change code between the "$" and the group number:
		$U1 - upper-case the text matched by group 1
		$L1 - lower-case it
		$I1 - title-case it
		$F1 - upper-case its first character and lower-case the rest
	
	Parameters:
		txt                 - the text to search (AppleScript text or NSString).
		regexPattern        - an ICU regex pattern.
		replacementTemplate - the replacement template. A backslash escapes the character after it.
	Returns: the edited text, as AppleScript text.
	Errors:  raises an error carrying NSRegularExpression's own description if the pattern is invalid.
	
	NOTE: a back-reference counts as escaped whenever the character immediately before its "$" is a
	backslash, even if that backslash is itself escaped.
*)
on regexReplace(txt, regexPattern, replacementTemplate)
	set |⌘| to current application
	-- Matching and capture-group extraction use an immutable copy of the input, so text captured
	-- outside a match's own range (eg. inside a look-ahead) is never read from already-edited text.
	set sourceText to |⌘|'s class "NSString"'s stringWithString:(txt)
	set mainText to |⌘|'s class "NSMutableString"'s stringWithString:(txt)
	set regexPattern to |⌘|'s class "NSString"'s stringWithString:(regexPattern)
	set replacementTemplate to |⌘|'s class "NSString"'s stringWithString:(replacementTemplate)
	
	-- Compile the input regex, reporting an invalid pattern explicitly.
	set {mainRegex, regexError} to |⌘|'s class "NSRegularExpression"'s regularExpressionWithPattern:(regexPattern) options:(0) |error|:(reference)
	if (mainRegex is missing value) then error (regexError's localizedDescription() as text)
	
	-- Get the input regex's matches in the input text.
	set mainRegexMatches to mainRegex's matchesInString:(sourceText) options:(0) range:({0, sourceText's |length|()})
	set mainRegexMatchCount to (count mainRegexMatches)
	-- Act on any found.
	if (mainRegexMatchCount > 0) then
		-- Use another regex to find any back-references in the replacement template. (Unescaped dollar sign, optional case code, 1 or 2 digits.)
		set backRefRegex to |⌘|'s class "NSRegularExpression"'s regularExpressionWithPattern:("(?<!\\\\)(\\$[ULIF]?)(\\d{1,2})") options:(0) |error|:(missing value)
		set backRefData to (backRefRegex's matchesInString:(replacementTemplate) options:(0) range:({0, replacementTemplate's |length|()})) as list
		set backRefCount to (count backRefData)
		set numberOfCaptureGroupsInMainRegex to (mainRegex's numberOfCaptureGroups()) -- NOT, as in Xcode literature, this minus 1.
		
		-- Split the template into unescaped literal segments and back-references. For each back-reference,
		-- store the literal text preceding it, its "prefix" ("$" and any case-change code), and its capture group number.
		set templateCursor to 0 -- Template offset of the first character not yet consumed.
		repeat with i from 1 to backRefCount
			set backRefMatch to item i of backRefData
			set backRefRange to backRefMatch's range()
			set backRefLocation to backRefRange's location
			set backRefLength to backRefRange's |length|
			set backRefPrefix to (replacementTemplate's substringWithRange:(backRefMatch's rangeAtIndex:(1))) as text
			set captureGroupNumber to (replacementTemplate's substringWithRange:(backRefMatch's rangeAtIndex:(2))) as text
			-- Imitate NSString's and NSRegularExpression's interpretation of the passed capture group number.
			if ((count captureGroupNumber) = 1) then -- Single digit.
				set captureGroupNumber to captureGroupNumber as integer
			else if (captureGroupNumber begins with "0") then -- Two digits, first = 0. Treat the second as part of the replacement text.
				set captureGroupNumber to 0
				set backRefLength to backRefLength - 1
			else -- Two-digit number. If > number of capture groups, treat the second digit as part of the replacement text.
				set captureGroupNumber to captureGroupNumber as integer
				if (captureGroupNumber > numberOfCaptureGroupsInMainRegex) then
					set captureGroupNumber to captureGroupNumber div 10
					set backRefLength to backRefLength - 1
				end if
			end if
			-- If still > number of capture groups, set to negative as a signal in the repeat below.
			if (captureGroupNumber > numberOfCaptureGroupsInMainRegex) then set captureGroupNumber to -1
			-- Only template text has its escapement removed; captured text is inserted verbatim later.
			set precedingLiteral to my unescapedTemplateText(replacementTemplate's substringWithRange:({templateCursor, backRefLocation - templateCursor}))
			set templateCursor to backRefLocation + backRefLength
			-- Store the data for this back-reference.
			set item i of backRefData to {precedingLiteral, backRefPrefix, captureGroupNumber}
		end repeat
		-- Whatever follows the last back-reference (or the whole template if there are none).
		set trailingLiteral to my unescapedTemplateText(replacementTemplate's substringFromIndex:(templateCursor))
		
		-- Perform the specified substitutions in the main text, last match first so that earlier ranges stay valid.
		repeat with i from mainRegexMatchCount to 1 by -1
			set regexMatch to item i of mainRegexMatches
			-- Assemble the replacement text for this match from the template's segments.
			set replacementText to |⌘|'s class "NSMutableString"'s new()
			repeat with j from 1 to backRefCount
				set {precedingLiteral, backRefPrefix, captureGroupNumber} to item j of backRefData
				tell replacementText to appendString:(precedingLiteral)
				-- A capture group number of -1 means the original number was too high: the back-reference contributes nothing.
				if (captureGroupNumber > -1) then
					set rangeMatchedByCaptureGroup to (regexMatch's rangeAtIndex:(captureGroupNumber))
					-- A zero-length range means the group matched nothing or didn't participate in the match.
					-- Either way it contributes an empty string, so there's nothing to extract or case-change.
					if ((rangeMatchedByCaptureGroup's |length|) > 0) then
						set textMatchedByCaptureGroup to (sourceText's substringWithRange:(rangeMatchedByCaptureGroup))
						if (backRefPrefix ends with "U") then
							set textMatchedByCaptureGroup to textMatchedByCaptureGroup's uppercaseString()
						else if (backRefPrefix ends with "L") then
							set textMatchedByCaptureGroup to textMatchedByCaptureGroup's lowercaseString()
						else if (backRefPrefix ends with "I") then
							set textMatchedByCaptureGroup to textMatchedByCaptureGroup's capitalizedString()
						else if (backRefPrefix ends with "F") then
							-- The first "character" may occupy more than one UTF-16 unit.
							set character1Length to (textMatchedByCaptureGroup's rangeOfComposedCharacterSequenceAtIndex:(0))'s |length|
							set initialCapital to (textMatchedByCaptureGroup's substringToIndex:(character1Length))'s uppercaseString()
							set textMatchedByCaptureGroup to (initialCapital's stringByAppendingString:((textMatchedByCaptureGroup's substringFromIndex:(character1Length))'s lowercaseString()))
						end if
						tell replacementText to appendString:(textMatchedByCaptureGroup)
					end if
				end if
			end repeat
			tell replacementText to appendString:(trailingLiteral)
			-- Replace the match in the main text with the completed replacement string.
			tell mainText to replaceCharactersInRange:(regexMatch's range()) withString:(replacementText)
		end repeat
	end if
	
	-- When all the pattern matches have been replaced, return the edited text.
	return mainText as text
end regexReplace

-- Private helper for regexReplace: returns a copy of the given NSString with each backslash escape replaced by the character it escapes.
on unescapedTemplateText(templateText)
	return templateText's stringByReplacingOccurrencesOfString:("\\\\(.)") withString:("$1") options:(current application's NSRegularExpressionSearch) range:({0, templateText's |length|()})
end unescapedTemplateText

-- Demonstration: tidy the letter case of each sentence in a sample text by upper-casing
-- its first word character and lower-casing the remainder, then return the result.
on demo()
	set sampleText to "this is soME Text. its cases need SortinG!"
	-- Group 1 is the first word character of a sentence; group 2 is the rest up to the sentence's end punctuation.
	set sentencePattern to "(?<=^|\\.\\s{1,10})(\\w)([^.!?]*+)"
	set caseTemplate to "$U1$L2"
	-- An alternative which captures the whole sentence in one group and uses the "F" code:
	(*
	set sentencePattern to "(?<=^|\\.\\s{1,10})([^.!?]*+)"
	set caseTemplate to "$F1"
	*)
	return regexReplace(sampleText, sentencePattern, caseTemplate)
	--> "This is some text. Its cases need sorting!"
end demo

-- Run the demonstration when the script itself is run; its result becomes the script's result.
demo()
1 Like