Hi all,
I have two lists, each of which contains words together with the number of times each word occurs in a text file. Each list is quite long (> 3000 items).
Basically (in a simplified format) I have:
ListA ListB
Bill: 4 Bill: 4
Coen: 2 Coen:1
Mark: 1 Mark: 3
Steve:4 Stefan: 2
Tom: 3 Tom: 3
I am trying to get a “differential” list, which should remove the entries that are “equal” in both lists and leave only the differences, or the items unique to each list. Something like this:
ListDelta
Coen: 1
Mark: -2
Stefan: 2
Steve: 4
I wrote a complicated (and slow) script, which I am attaching below; you don’t have to read it. It consists of one loop nested inside another loop.
Basically, the script:
- loads the two lists
- if there is an item in ListA with exactly the same word and the same count as in ListB, then it deletes this item from both lists (with the aim of reducing the list size)
- if an item in ListA is present in ListB but with a different count, then it removes the item from ListA and updates the count in ListB to the difference of the counts.
- Anything left in ListA is unique and is copied into ListB.
- ListB is now the difference of ListA versus ListB.
I wonder if you have a faster way to solve my problem.
Thanks in advance (and sorry for the long post).
L.
-- Builds a "delta" list from two word-count files and saves it to disk.
--
-- Each input file holds one entry per line in the form "<word> <count>"
-- (the word part is everything up to and including the first space).
-- Entries present in both lists with the same count are dropped; entries
-- present in both with different counts are kept once, in List2's position,
-- with the absolute difference as the new count; entries found in only one
-- list are kept unchanged. The result is List1's leftovers followed by
-- List2's leftovers, each in their original order.
--
-- p_the_path is read by the Export_list handler to know where to write.
property p_the_path : missing value
property p_newFileName : missing value

-- Large lists are held in script-object properties: accessing items through
-- a script property is much faster in AppleScript than through a plain
-- variable, and it lets entries be blanked in place instead of rebuilding
-- the whole list on every removal.
script workLists
	property listA : {}
	property listB : {}
	property merged : {}
end script

-- "read" on an alias opens and closes the file itself, so no file reference
-- can be left open if reading fails.
set theSourceList to (choose file with prompt "Select a file to import the List1:")
set workLists's listA to every paragraph of (read theSourceList)
set theSourceList2 to (choose file with prompt "Select a file to import the List2:")
set workLists's listB to every paragraph of (read theSourceList2)
set workLists's merged to {} -- script properties persist between runs, so reset

# let's prepare the name for saving the DeltaList into a file
tell (info for theSourceList) to set Nm to name
tell (info for theSourceList2) to set Nm2 to name
set newFileName to Nm & "_" & Nm2
set p_newFileName to newFileName

set the_totalCountOfList1 to 0
set the_totalCountOfList2 to 0

-- First pass over List2: total up every count (not only the ones that happen
-- to match a List1 word) and discard blank lines, e.g. the empty paragraph
-- that follows a trailing line break.
repeat with z from 1 to (length of workLists's listB)
	set lineB to item z of workLists's listB
	if lineB is "" then
		set item z of workLists's listB to missing value
	else
		set spaceB to offset of " " in lineB
		if spaceB > 0 and spaceB < (length of lineB) then
			-- A count that is not numeric is left out of the total rather
			-- than aborting the whole run; the line itself stays in the list.
			try
				set the_totalCountOfList2 to the_totalCountOfList2 + ((text (spaceB + 1) thru -1 of lineB) as integer)
			end try
		end if
	end if
end repeat

-- Main pass: walk List1 backwards and look for the same word in List2
-- (also backwards). Matched entries are marked with missing value instead of
-- being removed, so indexes stay valid and no list is copied.
repeat with a from (length of workLists's listA) to 1 by -1
	set lineA to item a of workLists's listA
	set spaceA to offset of " " in lineA -- we look for the " " (space) char in each line
	if lineA is "" then
		-- blank line: nothing to compare, and it must not reach the output
		set item a of workLists's listA to missing value
	else if spaceA > 0 and spaceA < (length of lineA) then
		set the_Word to text 1 thru spaceA of lineA -- includes the trailing space
		set the_Count to (text (spaceA + 1) thru -1 of lineA) as integer
		set the_totalCountOfList1 to the_totalCountOfList1 + the_Count -- we are counting the total number of words
		repeat with z from (length of workLists's listB) to 1 by -1
			set lineB to item z of workLists's listB
			if (lineB is not missing value) and (lineB starts with the_Word) then
				set spaceB to offset of " " in lineB
				-- only lines that actually carry a count after the space qualify
				if spaceB < (length of lineB) then
					set the_Count2 to (text (spaceB + 1) thru -1 of lineB) as integer
					set DeltaCount to the_Count - the_Count2
					if DeltaCount < 0 then set DeltaCount to -DeltaCount -- we dont want negative values
					if DeltaCount = 0 then
						-- identical in both lists: drop it from List2 as well
						set item z of workLists's listB to missing value
					else
						set item z of workLists's listB to the_Word & DeltaCount
					end if
					set item a of workLists's listA to missing value
					exit repeat -- first match (from the end) wins
				end if
			end if
		end repeat
	end if
	-- a non-blank line without "<word> <count>" shape is passed through untouched
end repeat

-- Collect the surviving entries: List1 leftovers first, then List2.
repeat with i from 1 to (length of workLists's listA)
	set entry to item i of workLists's listA
	if entry is not missing value then set end of workLists's merged to entry
end repeat
repeat with i from 1 to (length of workLists's listB)
	set entry to item i of workLists's listB
	if entry is not missing value then set end of workLists's merged to entry
end repeat
set deltaList to workLists's merged

# we need to save de DeltaFile
set the_path to (":Users:ldicroce:Dropbox (CRG):Programming:Counting Words:Text files:" & p_newFileName)
set p_the_path to the_path
-- Export_list ends with "close access", which yields no value, so its result
-- is deliberately not assigned to anything.
Export_list(deltaList, (length of deltaList))
display dialog the_totalCountOfList1 as string -- this is the total number of words of List1
display dialog the_totalCountOfList2 as string -- this is the total number of words of List2
-- Writes the first NumItems items of the_list to the file named by the
-- p_the_path property, one item per line (each followed by a return).
--
-- the_list : list whose items can be coerced to string
-- NumItems : how many items, counted from the first, to write
--
-- Any previous content of the file is discarded. If writing fails, the file
-- reference is closed before the error is passed on to the caller.
on Export_list(the_list, NumItems)
	-- set myExportedList to open for access (choose file name) with write permission
	set myExportedList to open for access file p_the_path with write permission
	try
		-- Truncate first: without this, writing over a longer existing file
		-- would leave its old tail after the new content.
		set eof of myExportedList to 0
		repeat with itemNum from 1 to NumItems
			write ((item itemNum of the_list as string) & return) to myExportedList -- MAYBE I SHOULD INCLUDE "write the_text to file_ref as «class utf8» "
		end repeat
	on error errMsg number errNum
		-- do not leave the file open, or the next run cannot open it for writing
		close access myExportedList
		error errMsg number errNum
	end try
	close access myExportedList
end Export_list
-- Returns a new list equal to the_list without the item at position
-- index_to_remove (1-based). The list passed in is not modified.
on remove_item_from_list(the_list, index_to_remove)
	-- dropping the head: everything after the first item
	if index_to_remove = 1 then return rest of the_list
	-- dropping the tail: everything before the last item
	if index_to_remove = (length of the_list) then return (items 1 thru -2 of the_list)
	-- dropping from the middle: join what precedes and what follows
	set leadingItems to items 1 thru (index_to_remove - 1) of the_list
	set trailingItems to items (index_to_remove + 1) thru -1 of the_list
	return leadingItems & trailingItems
end remove_item_from_list