#!/usr/bin/ruby
# Automatically file PDFs by reading them with pdftotext and putting them in the right folders by
# group/year/folder.
require "fileutils"

# Encapsulates the location information.
class LocInfo
  # The folder to file inside
  attr_reader :filing_folder
  # An optional "top most" folder, to subgroup things (use it to file for more than a single individual)
  attr_reader :topmost_folder
  # An optional filename suffix (use it to add things like the account number for multiple accounts
  # on the same folder)
  attr_reader :filename_suffix

  # filing_folder   - folder name the document is filed under.
  # topmost_folder  - optional folder prepended in front of the whole target path.
  # filename_suffix - optional text appended to the file name, before the extension.
  def initialize(filing_folder, topmost_folder = nil, filename_suffix = nil)
    @filing_folder = filing_folder
    @topmost_folder = topmost_folder
    @filename_suffix = filename_suffix
  end

  # Returns the filing folder as a string.
  def to_s
    @filing_folder.to_s
  end
end

# Encapsulates the instructions for renaming and unifies path name generation.
class RenameInstructions
  attr_accessor :source_filename
  attr_accessor :document_date, :folder, :filename, :extension
  attr_accessor :locinfo

  # Builds the relative destination path:
  #   [topmost_folder/][year/][folder/]filename[filename_suffix][extension]
  # Every bracketed part is left out when the corresponding value is nil.
  def target_path
    folder_path = target_folder
    result = folder_path.nil? ? @filename.to_s : "#{folder_path}/#{@filename}"

    unless @locinfo.nil?
      result = "#{@locinfo.topmost_folder}/#{result}" unless @locinfo.topmost_folder.nil?
      result = "#{result}#{@locinfo.filename_suffix}" unless @locinfo.filename_suffix.nil?
    end

    result = "#{result}#{@extension}" unless @extension.nil?
    result
  end

  # Returns "year/folder", using only the parts that are known, or nil when
  # neither a document date nor a folder has been set. (A missing document
  # date used to raise NoMethodError here.)
  def target_folder
    parts = []
    parts << @document_date.year.to_s unless @document_date.nil?
    parts << @folder.to_s unless @folder.nil?
    parts.empty? ? nil : parts.join("/")
  end

  # Returns "source_filename => target_path".
  def to_s
    "#{@source_filename} => #{target_path}"
  end
end

#### FROM http://aswembar.net/jed/dateInfo.html
# This class is meant to try to figure out what date is
# represented by a passed in string. It heavily uses regular
# expressions to determine this, so it may not be passed, but
# it gives a lot of freedom in allowing someone to enter a
# date easily.
# I chose not to throw errors, instead giving
# nil results for dates that can not be understood (strings that
# do not represent a valid date.)
class DateInfo
  SECONDS_PER_DAY = 86_400
  WEEKDAY_NAMES = %w[sun mon tue wed thu fri sat].freeze

  # number, separator, number, optionally another separator and number
  NUMERIC_DATE = /\A(\d+)\s*(-|\/|\\)\s*(\d+)\s*((-|\/|\\)\s*(\d+))?/
  MONTH_NAME_DATE = /\A(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[^\s]* (\d+)(.*)/
  COMPACT_DATE = /\A(\d\d\d\d)(\d\d)(\d\d)\Z/
  YEAR_ONLY = /\A(\d\d\d\d)\Z/
  WEEKDAY = /(this |last |next |)(sun|mon|tue|wed|thu|fri|sat)[^\s]*/

  def initialize
  end

  # It takes as input a string. If it determines that the string specifies a date,
  # it returns that date as a Time object. Otherwise it returns nil. It sees the following
  # types of strings as dates:
  # 1. Of the format MM/DD/YY, MM-DD-YY, MM\DD\YY. YY may be YYYY in any of the cases.
  #    Month/Day/Year is assumed unless the first of three components has four digits,
  #    in which case the string is read as YYYY/MM/DD. A year given with fewer than
  #    three digits is taken to be in the current millennium (2000). When the year is
  #    left out, the current year is used.
  # 2. Of the format "June 27, 1983", "Jun 27, 1983", or "June 27" (in which the current
  #    year is implied). Only the first three letters of the month are checked. The
  #    comma is optional, but one space must separate the month from the day.
  # 3. Of the format YYYYMMDD.
  # 4. Of the format YYYY. The date is specified as Jan 1 of the year YYYY.
  # 5. Last or Next or Current Day of the Week (ie last Thu, next Thursday, or Thursday).
  #    A minimum of three letters is used for the weekday name.
  # 6. Yesterday, today, tomorrow (these return the current time of day, shifted by
  #    whole days, not midnight).
  #
  # Strings that match one of the shapes above but name an impossible date
  # (for example month 13) also give nil.
  def interpretDate(input)
    input = input.to_s.downcase

    if (m = NUMERIC_DATE.match(input))
      numeric_date(m[1], m[3], m[6])
    elsif (m = MONTH_NAME_DATE.match(input))
      explicit_year = m[3][/\s*,?\s*(\d\d\d\d)/, 1]
      year = explicit_year ? explicit_year.to_i : Time.now.year
      # Time.local accepts three-letter month names; local time keeps this
      # branch consistent with every other branch.
      Time.local(year, m[1], m[2].to_i)
    elsif (m = COMPACT_DATE.match(input))
      Time.local(m[1].to_i, m[2].to_i, m[3].to_i)
    elsif (m = YEAR_ONLY.match(input))
      Time.local(m[1].to_i, 1, 1)
    elsif (m = WEEKDAY.match(input))
      weekday_date(m[1], m[2])
    elsif input =~ /today/
      Time.now
    elsif input =~ /tomorrow/
      Time.now + SECONDS_PER_DAY
    elsif input =~ /yesterday/
      Time.now - SECONDS_PER_DAY
    end
  rescue ArgumentError, RangeError
    # Time.local rejects out-of-range components; honour the "nil, never raise" contract.
    nil
  end

  # If input is not given, the input is today.
  # Returns the start of the month specified by input as a
  # string in the format mm/dd/yy.
  # Returns nil if the input does not represent a valid date.
  def startMonth(input = nil)
    t = resolve(input)
    return nil if t.nil?

    t.strftime("%m/01/%y")
  end

  # If input is not given, the input is today.
  # Returns the end of the month specified by input as a
  # string in the format mm/dd/yy.
  # Returns nil if the input does not represent a valid date.
  def endMonth(input = nil)
    t = resolve(input)
    return nil if t.nil?

    # The full four-digit year is needed for the leap-year rule.
    t.strftime("%m/#{days_in_month(t.year, t.month)}/%y")
  end

  # If input is not given, the date is today.
  # If input represents a valid date, returns the date as an integer (YYYYMMDD)
  # and as a string (MM/DD/YY).
  # Returns nil, nil if the input does not represent a valid date.
  def getFormattedDates(input = nil)
    t = resolve(input)
    return nil, nil if t.nil?

    return t.strftime("%Y%m%d").to_i, t.strftime("%m/%d/%y")
  end

  # Returns the number of days between start and end date, counting both ends.
  # Returns nil if startDate or endDate is invalid.
  def getDaysBetween(startDate, endDate)
    first = interpretDate(startDate)
    last = interpretDate(endDate)
    return nil if first.nil? || last.nil?

    ((last - first) / SECONDS_PER_DAY).to_i + 1
  end

  # It takes as input a string (or nil for today). If the string specifies a date,
  # the method returns a report with the date, its day of the week, and the number
  # of days the date is away from today. Otherwise it returns a fixed
  # "not recognized" message.
  def dateReporter(input)
    t = resolve(input)
    return "That Date is no Recognized" if t.nil?

    distance = ((t - midnight_today) / SECONDS_PER_DAY).to_i

    report = "You asked about #{t.strftime('%B %d, %Y')}\n"
    report += "This date is a #{t.strftime('%A')}.\n"
    report += if distance.zero?
                "This is today's date."
              elsif distance == 1
                "This is tomorrow's date."
              elsif distance == -1
                "This was yesterday's date."
              elsif distance > 0
                "This date will occur in #{distance} days."
              else
                "This date occured #{-distance} days ago."
              end
    report
  end

  private

  # Midnight (local time) of the current day.
  def midnight_today
    now = Time.now
    Time.local(now.year, now.mon, now.day)
  end

  # nil input means "today"; anything else is parsed with interpretDate.
  def resolve(input)
    input.nil? ? midnight_today : interpretDate(input)
  end

  # Builds a Time from the digit strings captured by NUMERIC_DATE.
  def numeric_date(first, second, third)
    if third && first.length == 4
      # YYYY/MM/DD
      return Time.local(first.to_i, second.to_i, third.to_i)
    end

    year = third ? third.to_i : Time.now.year
    year += 2000 if third && year < 100 # two-digit years mean 20xx
    Time.local(year, first.to_i, second.to_i)
  end

  # Resolves "last thu" / "next thu" / "thu" relative to today.
  def weekday_date(qualifier, day_name)
    today = midnight_today
    base = case qualifier
           when "last " then today - 7 * SECONDS_PER_DAY
           when "next " then today + 7 * SECONDS_PER_DAY
           else today
           end
    distance = WEEKDAY_NAMES.index(day_name) - today.wday
    distance += 7 if distance < 0
    base + distance * SECONDS_PER_DAY
  end

  # Number of days in the given month of the given (four-digit) year.
  def days_in_month(year, month)
    return 30 if [4, 6, 9, 11].include?(month)
    return 31 unless month == 2

    leap_year?(year) ? 29 : 28
  end

  # Gregorian leap-year rule.
  def leap_year?(year)
    (year % 4).zero? && (!(year % 100).zero? || (year % 400).zero?)
  end
end

# Does the filename determination.
class FileNameDetermination
  # Location of the pdftotext binary used to read the PDFs.
  PDFTOTEXT = "/opt/local/bin/pdftotext"

  # Only names that END in .pdf are processed.
  PDF_FILENAME = /\.pdf\z/

  # Lines matching any of these are considered noise (zip codes, toll-free
  # numbers, boilerplate) and their dates are ignored.
  NOISE_PATTERNS = [
    /\d{5}/,
    /.*1[\- ]*(800|888|866).*/,
    /.*(program|in line|CUSTOMER SERVICE|Previous Bal|BOX|Wireless|Numbers|Family).*/,
    /.*INTEREST FROM.*/
  ].freeze

  attr_reader :rename_instructions

  # directory - the folder that holds the PDFs to be filed.
  def initialize(directory)
    @directory = directory
    @rename_instructions = []

    # The master map of renaming rules. Basically "If you find this regex in the PDF, put it here"
    @unique_regexes = {
      /1234567890/ => LocInfo.new('Bank of Nowhere Home Loan'),
      # Anything unique to the type of document works, like the doctor's name or just your cable provider name
      /Thomas Kojack/i => LocInfo.new('Richard Dr. Kojak'),
      /Comcast/i => LocInfo.new("Comcast"),
      # Here is how you can do sub-filing (like for another person, say your grandmother)
      /Bank of Some Big Place/i => LocInfo.new('Some Big Place Bank', 'Grandma'),
    }

    # Some regexes to match dates
    @re_generic_mdy_date = /\A(\d+)(-|\/|\\)(\d+)((-|\/|\\)(\d+))?/
  end

  # Reads every PDF in the directory, works out where it belongs, then moves
  # the files. A PDF that pdftotext cannot read is skipped; the remaining
  # files are still processed. Target paths are relative to the current
  # working directory.
  def determine_filename
    puts "Dir: #{@directory}"

    Dir.entries(@directory).each do |filename|
      path = "#{@directory}/#{filename}"
      next if File.directory?(path) || filename !~ PDF_FILENAME

      contents = extract_text(path)
      if contents.nil?
        puts "Skipping #{path} because pdftotext failed"
        next
      end

      @rename_instructions.push(build_instructions(filename, path, contents))
      puts "Analyzing: #{filename}"
    end

    # Now walk through all the rename instructions and do the actual renames.
    puts "Performing file moves.."
    @rename_instructions.each { |instructions| move_file(instructions) }
  end

  # Returns false for lines that look like noise (see NOISE_PATTERNS), true for
  # everything else. The original also listed "positive" hints (Covers, Through,
  # Opening ... Closing Date, Pay Date, Paymend Due Date), but they never
  # affected the result because every non-noise line was accepted anyway.
  def has_significant_info?(line)
    NOISE_PATTERNS.none? { |pattern| pattern.match(line) }
  end

  private

  # Runs pdftotext on path and returns its output lines, or nil when the tool
  # is missing or did not exit successfully. The command is passed as an
  # argument list, so no shell is involved and odd file names are safe.
  def extract_text(path)
    lines = IO.popen([PDFTOTEXT, path, "-"], "r", &:readlines)
    $?.success? ? lines : nil
  rescue SystemCallError
    nil
  end

  # Builds the RenameInstructions for a single PDF from its extracted text.
  def build_instructions(filename, path, contents)
    instructions = RenameInstructions.new
    instructions.source_filename = filename
    instructions.extension = ".pdf"

    # Find out the folder first, by running the known regexes over the entire
    # document content (pipe separated instead of newline separated).
    haystack = contents.join("|")
    _regex, locinfo = @unique_regexes.find { |regex, _info| regex.match(haystack) }
    unless locinfo.nil?
      instructions.locinfo = locinfo
      instructions.folder = locinfo.filing_folder
    end

    # No way to find the document date? Use the file's own change time.
    instructions.document_date = find_document_date(contents) || File.ctime(path)
    instructions.filename = instructions.document_date.strftime("%Y-%m-%d")
    instructions
  end

  # Scans the text for date-looking tokens on significant lines. The LAST
  # token that parses as a date wins. Returns nil when there is none.
  def find_document_date(contents)
    date_info = DateInfo.new
    found = nil

    contents.each do |line|
      line.split(' ').each do |entry|
        next unless @re_generic_mdy_date.match(entry) && has_significant_info?(line)

        candidate = begin
          date_info.interpretDate(entry)
        rescue StandardError
          nil # best effort: an unparsable token is simply ignored
        end
        found = candidate unless candidate.nil?
      end
    end

    found
  end

  # Moves one source file to its target path, appending -2, -3, ... to the
  # file name while the target already exists, and creating the parent
  # directory if it's not there.
  def move_file(instructions)
    base_name = instructions.filename
    counter = 1
    while File.exist?(instructions.target_path)
      counter += 1
      instructions.filename = "#{base_name}-#{counter}"
    end

    target_folder = File.dirname(instructions.target_path)
    FileUtils.mkdir_p(target_folder) unless File.exist?(target_folder)

    puts "#{instructions.source_filename} -> #{instructions.target_path}"
    FileUtils.mv("#{@directory}/#{instructions.source_filename}", instructions.target_path)
  end
end

if ARGV[0]
  determination = FileNameDetermination.new(ARGV[0])
  determination.determine_filename
else
  warn "Usage: #{$0} <directory containing PDFs>"
end