Araxis Find Dup Files - Count dups by tuple

I’ve been meaning to share this for a while.
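This script reads the duplicate list created by the Araxis Find Dup Files app and lists the number of dups found for each folder tuple (each distinct set of folders that share a duplicated file). I use this to find folders that have nearly duplicate contents. It includes a sort handler (named heap_sort, although it is implemented as a recursive merge sort) whose compare rule can be customized by plugging in a different script object.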


-- Script object that supplies the sort key; assigned just before sorting.
property value_test : missing value

-- This reads the Duplicates Report and counts the number of files for every pair or n-tuple of parent folders.
--
-- Overview:
--   1. Ask for the Araxis duplicates list and read it one line (one file path) at a time.
--   2. Skip housekeeping files (.DS_Store, Bridge sidecars, Icon files, text clippings).
--   3. Treat consecutive files of identical size as one group of duplicates and collect
--      the sorted list of their parent folders (the "tuple").
--   4. Count how many groups share each tuple, then append a report to a log file on
--      the desktop: tuples seen once, then twice, then more than twice.
--
-- NOTE(review): a change in file size is the only group boundary used, so two adjacent
-- groups whose files happen to have the same size are merged - confirm against the
-- report format.
-- NOTE(review): paths are converted to HFS form by dropping the first two "/"-separated
-- items, which presumes every path looks like /Volumes/<disk>/... - confirm.

set tab_char to ASCII character 9
set lf_char to ASCII character 10
set duplicates_file_path_name to (choose file with prompt "Pick the Araxis Duplicates List to use") as string
set duplicates_file_id to open for access file duplicates_file_path_name
set log_file_path to (path to desktop as string) & "Folders with dups from Araxis.txt"
set file_id to open for access file log_file_path with write permission
write "Start " & (current date) & return & return to file_id starting at eof
set current_size to -1
set result_set to {}
set type_count_set to {}
set name_tuple to missing value
-- File "types" (text after the last ".") that are never counted as duplicates.
set ignored_file_types to {"DS_Store", "BridgeSort", "BridgeLabelsAndRatings", "textClipping"}
try
	repeat
		set temp_text to read (duplicates_file_id as text) until lf_char as «class utf8»
		-- Strip the trailing linefeed only when it is really there, so a final line
		-- without one keeps its last character and a blank line becomes "".
		set file_path_name to temp_text
		if file_path_name ends with lf_char then
			if (count of characters in file_path_name) > 1 then
				set file_path_name to text 1 thru -2 of file_path_name
			else
				set file_path_name to ""
			end if
		end if
		if file_path_name is not "" then
			set AppleScript's text item delimiters to "/"
			set file_name to text item -1 of file_path_name
			set AppleScript's text item delimiters to "."
			set file_type to text item -1 of file_name
			set AppleScript's text item delimiters to ""
			if not ((file_type is in ignored_file_types) or (file_type begins with "Icon")) then
				-- Tally how many listed files there are of each type.
				set existing_record to false
				repeat with curr_record in type_count_set
					if file_type of curr_record = file_type then
						set existing_record to true
						set type_count of curr_record to (type_count of curr_record) + 1
						exit repeat
					end if
				end repeat
				if not existing_record then
					set end of type_count_set to {file_type:file_type, type_count:1}
				end if
				-- Convert the POSIX path to a colon-separated path for the Finder.
				set AppleScript's text item delimiters to "/"
				set folder_path_name to text items 3 thru -2 of file_path_name
				set file_path_name to text items 3 thru -1 of file_path_name
				set AppleScript's text item delimiters to ":"
				set file_path_name to file_path_name as string
				set folder_path_name to folder_path_name as string
				set AppleScript's text item delimiters to ""
				tell application "Finder" to set file_size to size of file file_path_name
				if file_size ≠ current_size then
					-- A new size starts a new group: record the tuple just completed.
					my increment_tuple_count(name_tuple, result_set)
					set name_tuple to missing value
					set current_size to file_size
				end if
				set name_tuple to my add_to_tuple(folder_path_name, name_tuple)
			end if
		end if
	end repeat
on error error_message number error_number
	set AppleScript's text item delimiters to ""
	if error_number is not -39 then
		-- Anything other than end-of-file is unexpected: release both files so they
		-- are not left open, then pass the original error on.
		try
			close access duplicates_file_id
		end try
		try
			close access file_id
		end try
		error error_message number error_number
	end if
	-- error -39 is end of file: the normal way out of the read loop
end try
-- The last group has no following size change to trigger its count, so record it here.
my increment_tuple_count(name_tuple, result_set)
close access duplicates_file_id
repeat with curr_record in type_count_set
	write (((type_count of curr_record) as string) & tab_char & ((file_type of curr_record) as string) & return) to file_id starting at eof
end repeat
set list_count to count of items of result_set
write (list_count as string) & " Folder sets found" & return & return to file_id starting at eof
if list_count > 0 then
	set value_test to get_first_item_in_name_tuple
	set result_set to my heap_sort(result_set)
	-- Three passes over the sorted list: tuples counted once, twice, then more than twice.
	repeat with pass_number from 1 to 3
		repeat with curr_record in result_set
			set dup_count to tuple_count of curr_record
			if ((pass_number < 3) and (dup_count = pass_number)) or ((pass_number = 3) and (dup_count > 2)) then
				write (dup_count as string) & return to file_id starting at eof
				repeat with folder_ref in name_tuple of curr_record
					-- Work with the plain text rather than the loop's item reference.
					set folder_name to contents of folder_ref
					tell application "Finder"
						set folder_count to count of files in folder folder_name
						set folder_size to size of folder folder_name
					end tell
					write (folder_count as string) & tab_char & (folder_size as string) & tab_char & folder_name & return to file_id starting at eof
				end repeat
				write return to file_id starting at eof
			end if
		end repeat
	end repeat
else
	write "No dups found" & return to file_id starting at eof
end if
write "End " & (current date) & return to file_id starting at eof
close access file_id
say "finished"
return

on add_to_tuple(location_name, name_tuple)
	-- Inserts location_name into name_tuple, keeping the list in ascending order.
	--   location_name : the value to insert
	--   name_tuple    : an already-sorted list, or missing value to start a new one
	-- Returns the list holding the new value. An existing list is modified in
	-- place and that same list is returned.
	if name_tuple is missing value then return {location_name}
	
	set original_count to count of items of name_tuple
	-- Grow the list by one; the appended copy is overwritten as items shift up.
	set end of name_tuple to location_name
	
	-- open_slot is the position the new value would take; walk it down while
	-- the item just below it sorts after the new value.
	set open_slot to original_count + 1
	repeat while open_slot > 1
		set lower_neighbour to item (open_slot - 1) of name_tuple
		if not (lower_neighbour > location_name) then exit repeat
		set item open_slot of name_tuple to lower_neighbour
		set open_slot to open_slot - 1
	end repeat
	set item open_slot of name_tuple to location_name
	return name_tuple
end add_to_tuple

on increment_tuple_count(name_tuple, result_set)
	-- Records one more occurrence of name_tuple in result_set.
	--   name_tuple : a list to count, or missing value (in which case nothing happens)
	--   result_set : list of {name_tuple, tuple_count} records, updated in place
	-- A record whose name_tuple equals the given list has its tuple_count raised
	-- by one; if there is none, a new record with a count of 1 is appended.
	if name_tuple is missing value then return
	
	repeat with tally_record in result_set
		if name_tuple of tally_record = name_tuple then
			set tuple_count of tally_record to (tuple_count of tally_record) + 1
			return
		end if
	end repeat
	
	-- No matching record was found: start a new tally.
	set end of result_set to {name_tuple:name_tuple, tuple_count:1}
end increment_tuple_count

on heap_sort(set_to_sort)
	-- Sorts a list into ascending order of the key supplied by the script object
	-- held in the value_test property (its value_to_test(index, list) handler).
	-- Despite the name, the method is a recursive merge sort: lists of more than
	-- two items are split in half, each half is sorted by a recursive call, and
	-- the halves are combined by merge_sets.
	--   set_to_sort : the list to sort; a two-item list is reordered in place
	-- Returns the sorted list. Progress is written to the log on every call.
	set total_items to count of set_to_sort
	log total_items
	if total_items ≤ 2 then
		-- Base case: nothing to do for 0 or 1 items, one comparison for 2.
		if total_items = 2 then
			set first_key to value_test's value_to_test(1, set_to_sort)
			set second_key to value_test's value_to_test(2, set_to_sort)
			if first_key > second_key then
				set displaced_item to item 1 of set_to_sort
				set item 1 of set_to_sort to item 2 of set_to_sort
				set item 2 of set_to_sort to displaced_item
			end if
		end if
		set finished_set to set_to_sort
	else
		set mid_point to (total_items / 2) as integer
		set lower_half to my heap_sort(items 1 thru mid_point of set_to_sort)
		set upper_half to my heap_sort(items (mid_point + 1) thru total_items of set_to_sort)
		set finished_set to my merge_sets(lower_half, upper_half)
	end if
	-- Trace the first name of every record in the order produced by this call.
	log "  new list"
	repeat with sorted_record in finished_set
		log "   " & (item 1 of name_tuple of sorted_record) as string
	end repeat
	log "   end merge" & return & "     "
	return finished_set
end heap_sort

on merge_sets(set_a, set_b)
	-- Merges two lists into one new list, ordered by the key that the script
	-- object in the value_test property returns from value_to_test(index, list).
	--   set_a, set_b : the lists to merge (neither is modified)
	-- Returns the merged list. While both lists still have items, the item from
	-- set_a is taken only when its key is strictly less than the key of the item
	-- from set_b; otherwise the item from set_b is taken.
	set cursor_a to 1
	set cursor_b to 1
	set size_a to count of items of set_a
	set size_b to count of items of set_b
	set merged_set to {}
	-- Exactly one item is moved per pass, so the loop runs once per item.
	repeat (size_a + size_b) times
		set take_from_a to false
		if cursor_b > size_b then
			-- set_b is used up, so the rest comes from set_a
			set take_from_a to true
		else if cursor_a ≤ size_a then
			-- both lists still have items: compare their keys
			set take_from_a to (value_test's value_to_test(cursor_a, set_a) < value_test's value_to_test(cursor_b, set_b))
		end if
		if take_from_a then
			set end of merged_set to item cursor_a of set_a
			set cursor_a to cursor_a + 1
		else
			set end of merged_set to item cursor_b of set_b
			set cursor_b to cursor_b + 1
		end if
	end repeat
	return merged_set
end merge_sets

script get_first_item_in_name_tuple
	-- Sort-key provider: the key of a record is the first entry of its name_tuple list.
	--   curr_index  : position of the record within set_to_test
	--   set_to_test : list of records that each have a name_tuple property
	-- Returns item 1 of the selected record's name_tuple.
	on value_to_test(curr_index, set_to_test)
		set chosen_record to item curr_index of set_to_test
		set tuple_names to name_tuple of chosen_record
		return item 1 of tuple_names
	end value_to_test
end script