import codecs
import math
import os
import pdb
import sys

try:
    import cPickle
except ImportError:
    # Interpreters without cPickle: pickle exposes the same dump/load API.
    import pickle as cPickle

import numpy as np
import scipy.sparse
import scipy.sparse.linalg

# Text-processing pipeline over the files found below proceedings/.
# It runs one stage per invocation, selected by the `stage` variable:
#   1: count word occurrences            -> data/words_stats.pickle
#   2: select words by frequency         -> data/dictionary.pickle
#   3: build the document-word matrix    -> data/doc_word_matrix*.pickle

# List every non-empty .txt file.  The pattern is quoted so that the shell
# passes it to find untouched instead of expanding it against the files of
# the current directory.
pipe = os.popen("find proceedings/ -name '*.txt' -size +0", "r")
try:
    # Each entry keeps the trailing newline printed by find; it is removed
    # with fname[:-1] at the point where the path is used.
    fnames = sorted(pipe)
finally:
    pipe.close()
print("processing %d documents" % len(fnames))

# to remove punctuation: translation table mapping each character to a space
mapping = dict((ord(char), 0x20) for char in ",:!?.[]()")


def isnumber(s):
    """Return True if float() accepts *s*, False if it raises ValueError."""
    try:
        float(s)
        return True
    except ValueError:
        return False


def _iter_words(fname):
    """Yield the lower-cased words of the UTF-8 text file *fname*.

    Punctuation listed in `mapping` is replaced by spaces before each line
    is split on whitespace.  The file is closed when iteration finishes.
    """
    with codecs.open(fname, encoding="utf8") as stream:
        for line in stream:
            for word in line.translate(mapping).split():
                yield word.lower()


# Stage to execute; edit by hand to run another step of the pipeline.
stage = 1

if stage == 1:
    # find unique words and count their occurrences over all documents
    words = {}
    for fname in fnames:
        fname = fname[:-1]
        print("parsing %s, seen %d unique words so far" % (fname, len(words)))
        for word in _iter_words(fname):
            # numbers and words of one or two characters are not counted
            if isnumber(word) or len(word) <= 2:
                continue
            words[word] = words.get(word, 0) + 1

    # pickle files are always opened in binary mode
    with open("data/words_stats.pickle", "wb") as out:
        cPickle.dump(words, out)

elif stage == 2:
    # select words based on frequency
    with open("data/words_stats.pickle", "rb") as src:
        words = cPickle.load(src)

    if False:
        # display most and least frequent words
        fwords = sorted(((freq, word) for word, freq in words.items()),
                        reverse=True)
        for freq, word in fwords[:100] + fwords[-100:]:
            print("%r %s" % (word, freq))
        sys.exit(0)

    # The count of 'image' is the upper frequency bound; this lookup raises
    # KeyError when 'image' is not among the counted words.
    freq_image = words['image']

    word_selection = [word for word, freq in words.items()
                      if 6 < freq <= freq_image
                      and len(word) >= 2
                      and not isnumber(word)]

    print("selected %d words" % len(word_selection))

    with open("data/dictionary.pickle", "wb") as out:
        cPickle.dump(word_selection, out)

elif stage == 3:
    # construct the document-word matrix
    with open("data/dictionary.pickle", "rb") as src:
        dictionary = cPickle.load(src)

    # alpha order
    dictionary.sort()

    # mapping from dictionary word to word number (its column index)
    dictionary = dict((word, no) for no, word in enumerate(dictionary))

    # Dictionary Of Keys based sparse matrix: one row per document,
    # one column per dictionary word
    doc_word_matrix = scipy.sparse.dok_matrix((len(fnames), len(dictionary)))

    print("filling matrix of size %s" % (doc_word_matrix.shape,))

    for fno, fname in enumerate(fnames):
        fname = fname[:-1]
        # nnz is the number of stored entries; it is used instead of len(),
        # which SciPy rejects as ambiguous for sparse matrices
        print("%s %d" % (fname, doc_word_matrix.nnz))
        for word in _iter_words(fname):
            if word not in dictionary:
                continue
            doc_word_matrix[(fno, dictionary[word])] += 1

    with open("data/doc_word_matrix.pickle", "wb") as out:
        cPickle.dump(doc_word_matrix, out)

    # convert sparse matrix format to something more linear algebra compatible
    m_csr = scipy.sparse.csr_matrix(doc_word_matrix)

    # protocol 2 is a binary pickle format
    with open("data/doc_word_matrix_csr.pickle", "wb") as out:
        cPickle.dump(m_csr, out, 2)