/*
 * Document frequencies.
 * Group the term_frequencies rows by term, then re-emit every grouped row
 * (flattened to file_name, term, term_freq) together with doc_freq, the
 * number of rows in that term's group.
 * NOTE(review): doc_freq equals "number of documents containing the term"
 * only if term_frequencies holds one row per (file_name, term) -- confirm
 * against the statement that builds term_frequencies.
 */
 group_term_frequencies = GROUP term_frequencies BY term;

 doc_term_count = FOREACH group_term_frequencies GENERATE
     FLATTEN(term_frequencies) AS (file_name, term, term_freq),
     COUNT_STAR(term_frequencies) AS doc_freq;
  
  
 /* Generate the doc count in the corpus */
 doc_groups = foreach (group file_and_sentence by file_name) generate group as file_name;
  
 doc_count = foreach(group doc_groups all) generate COUNT(doc_groups) as n_docs;
  
  
 /* Generate the final tf-idf scores */
 scores =  foreach doc_term_count generate file_name as file_name, term as term, term_freq * LOG((double)doc_count.n_docs/(double)doc_freq) as tf_idf;
  
 ordered_scores = order scores by tf_idf ;
 
 
