import scipy.sparse, scipy.sparse.linalg
import numpy as np
import os, codecs, cPickle, math, sys, pdb

fnames = list(os.popen("find proceedings/ -name *.txt -size +0", "r"))
fnames.sort()

print "processing %d documents" % len(fnames)

# Translation table for unicode.translate(): every punctuation character
# listed here is replaced by a space (0x20), so that the words on either
# side of it are separated by a later split().
mapping = dict((ord(ch), 0x20) for ch in ",:!?.[]()")

def isnumber(s):
    """Return True if float() can parse the string s, False otherwise.

    Anything float() accepts counts as a number, so besides plain digits
    this also covers forms such as "1e5", "nan" and "inf".
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

stage = 1

if stage == 1: 
    # find unique words
    words = {}

    for fname in fnames:
        fname = fname[:-1]
        print "parsing %s, seen %d unique words so far" % (fname, len(words))
        for line in codecs.open(fname, encoding = "utf8"): 
          # remove punctuation  
          line = line.translate(mapping)

          for word in line.split(): 
              word = word.lower()
              if isnumber(word) or len(word) <= 2: continue
              try:
                  words[word] += 1
              except KeyError: 
                  words[word] = 1
    
    cPickle.dump(words, open("data/words_stats.pickle", "w"))
elif stage == 2:

    # select words based on frequency
    words = cPickle.load(open("data/words_stats.pickle", "r"))

    if False:
        # display most and least frequent words
        
        fwords = [(freq, word) for word, freq in words.iteritems()] 
    
        fwords.sort()
        fwords.reverse()
        for freq, word in fwords[:100] + fwords[-100:]: 
            print repr(word), freq

        sys.exit(0)
    
    freq_image = words['image']

    word_selection = [word for word, freq in words.iteritems() if 
                      6 < freq <= freq_image and len(word) >= 2 and
                      not isnumber(word)]

    print "selected %d words" % len(word_selection)

    cPickle.dump(word_selection, open("data/dictionary.pickle", "w"))
    
elif stage == 3: 
    # constuct the document-word matrix

    dictionary = cPickle.load(open("data/dictionary.pickle", "r"))
    
    # alpha order
    dictionary.sort()

    # mapping from dictionary to word number
    dictionary = dict([(word, no) for no, word in enumerate(dictionary)])
    
    # Dictionary Of Keys based sparse matrix
    doc_word_matrix = scipy.sparse.dok_matrix((len(fnames), len(dictionary)))
    print "filling matrix of size", doc_word_matrix.shape
    
    for fno, fname in enumerate(fnames):
        fname = fname[:-1]
        print fname, len(doc_word_matrix)
        for line in codecs.open(fname, encoding = "utf8"): 
          line = line.translate(mapping)

          for word in line.split(): 
              word = word.lower()
              if word not in dictionary: continue
              doc_word_matrix[(fno, dictionary[word])] += 1

    cPickle.dump(doc_word_matrix, open("data/doc_word_matrix.pickle", "w"))

    # convert sparse matrix format to something more linear algebra compatible
    m_csr = scipy.sparse.csr_matrix(doc_word_matrix)
    cPickle.dump(m_csr, open("data/doc_word_matrix_csr.pickle", "w"), 2)