#!/local/bin/perl # # Read in transcript text, extract spoken words (not inside parentheses or # brackets), and sort by number of occurrences. # Final output includes historical hourly averages for each word. # # Copyright 2000-2008 by Curtis C. Chen # #---------------------------------------- # Range guidelines #---------------------------------------- # COMMON: 7/hr+ # UNCOMMON: 3/hr...6/hr # RARE: 1/hr...2/hr # NAMES: extracted manually (9+ total count, or studio/company/nominee) # undef $/; # ignore line separators $debug=0; #in hours $show_length=4; #------------------------------------------------------------ @datafile = ( 'speeches_1999', 'speeches_2000', 'speeches_2001', 'speeches_2002', 'speeches_2003', 'speeches_2004', 'speeches_2006', 'speeches_2007', ); # NOTE: 2005 is missing because I changed to using the broadcast year # (i.e., the awards for 2006 are announced in early 2007). @wordlist=(); %wordhash=(); # per-hour counts $common_min = 7; $uncommon_min = 3; $rare_min = 1; #filenames $common_file = 'COMMON_RAW'; $uncommon_file = 'UNCOMMON_RAW'; $rare_file = 'RARE_RAW'; $names_file = 'NAMES_RAW'; #------------------------------------------------------------ $rawbuf=$buf=''; # get transcript text foreach $df (@datafile) { open (INFILE,"$df"); $rawbuf .= ; close (INFILE); } # kill extra formatting @chunky = split /\n\s*\n/,$rawbuf; foreach $chunk (@chunky) { $debug && print STDERR "CHUNK: \*$chunk\*\n"; #remove parentheticals and other non-spoken notations $chunk =~ s/\([^\)]*\)//g; $chunk =~ s/\[[^\]]*\]//g; ## #we only want DIALOGUE (screenplay format) ## if ($chunk =~ /^([A-Z]+\s*)+(\n\s*)+\S+/) { ## #remove SPEAKER heading (screenplay format) ## $chunk =~ s/^([A-Z]+\s*)+\n//; #squeeze blank lines $chunk =~ s/(\n\s*\n)+/ /g; #remove normal punctuation $chunk =~ s/(\w)[\'\"\.\,\?\!\;\:\-]+(\s+)/$1$2/g; #remove non-spaced punctuation $chunk =~ s/(\w)[\'\,\-]+(\w)/$1 $2/g; #add extra spaces as buffer! $buf .= " $chunk "; ## } } $debug && print STDERR $buf; # split into single words @wordlist = split /\s+/, $buf; #RFE: Text::ParseWords ? - "Programming Perl" chapter 7 #RFE: count whether same word is capitalized more often than not and use the more common version for display/name sensing! #RFE: look for plurals, roll into singular word count # 1st hash: word->count foreach $word (@wordlist) { # strip leading and trailing garbage $word =~ s/^\W+//g; $word =~ s/\W+$//g; # kill contractions unless ( $word =~ /n\'t$/i ) { $word =~ s/\'\w\w?$//i; } # check for non-proper noun form, or vice versa if ($word =~ /^[A-Z]/) { $drow = lcfirst $word; } else { $drow = ucfirst $word; } # fuzzy matching forms $expanded = $singular = $word; if ($word !~ /s$/i) { $plural = $word . 's'; } $expanded =~ s/\w\'t//i; $singular =~ s/s$//i; # (same for proper/non) $expanded_p = $singular_p = $drow; if ($drow !~ /s$/i) { $plural_p = $drow . 's'; } $expanded_p =~ s/\w\'t//i; $singular_p =~ s/s$//i; # only count words! # NOTE: first occurrence of word is displayed, though all forms (see above) are counted #RFE: use "switch" instead of "else" for better readability if ($word=~/\w+/ && $word!~/^[\d,]*$/) { if ($wordhash{$word}) { $wordhash{$word}++; } elsif ($wordhash{$drow}) { $wordhash{$drow}++; } elsif ($wordhash{$singular}) { $wordhash{$singular}++; } elsif ($wordhash{$singular_p}) { $wordhash{$singular_p}++; } elsif ($wordhash{$plural}) { $wordhash{$plural}++; } elsif ($wordhash{$plural_p}) { $wordhash{$plural_p}++; } elsif ($wordhash{$expanded}) { $wordhash{$expanded}++; } elsif ($wordhash{$expanded_p}) { $wordhash{$expanded_p}++; } else { $wordhash{$word}=1; } } } #RFE: weight older years' word counts more heavily, so that names of #people who won big recently (and are not likely to repeat) don't show up #so much the next year # print results # open output files open(CFILE,"+>$common_file"); open(UFILE,"+>$uncommon_file"); open(RFILE,"+>$rare_file"); open(NFILE,"+>$names_file"); $seen='|'; foreach $wordcount (reverse sort asnum values %wordhash) { if ($seen !~ /\|$wordcount\|/) { # calculate averages $years = scalar @datafile; $avg_yr = int($wordcount/$years); $avg_hr = int($avg_yr/$show_length); print STDOUT "$wordcount TOTAL\n$avg_yr\/YEAR\n$avg_hr\/HR\:\n"; @temp=(); foreach $finalword (keys %wordhash) { if ( ($wordhash{$finalword}==$wordcount) && (! grep /^$finalword$/i, @temp) ) { push @temp, $finalword; } } # make bare word list $buf=''; foreach $hardcopy (sort nocase @temp) { $buf = $buf."\t".$hardcopy."\n"; } # print to file if ($wordcount >= $common_min) { print CFILE "$buf\n"; } elsif ($wordcount < $common_min && $wordcount >= $uncommon_min) { print UFILE "$buf\n"; } elsif ($wordcount < $uncommon_min && $wordcount >= $rare_min) { print RFILE "$buf\n"; } else { # this should actually be blank print NFILE "$buf\n"; } # print to console print STDOUT "$buf\n"; } $seen .= "$wordcount|"; } close(CFILE); close(UFILE); close(RFILE); close(NFILE); #------------------------------------------------------------ sub asnum { return (scalar($a)<=>scalar($b)); } sub nocase { return lc($a) cmp lc($b); } __END__