Module pymake.util.vocabulary

Source code
import os, re
from string import punctuation
from pymake.util.utils import basestring


import numpy as np
import scipy as sp
import scipy.sparse  # required so that sp.sparse is available


class Vocabulary(object):

    recover_list = {"wa":"was", "ha":"has"}

    regex = {
        'word' : re.compile(r'[a-zA-Z0-9_\-]+'),
        'not_word' : re.compile(r'[^a-zA-Z0-9_\-]+'),
        'ponctuation' : re.compile(r'[\.\,\;\:\!\?\(\)\{\}\[\]\'\"\`]+'),
    }

    def __init__(self, exclude_stopwords=False, lemmatize=True):

        try:
            import nltk
            _NLTK_DISABLED = False
        except ImportError:
            _NLTK_DISABLED = True

        self.vocas = []        # id to word
        self.token2id = dict() # word to id
        self.docfreq = []      # id to document frequency
        self.exclude_stopwords = exclude_stopwords

        stopwords_list = []
        if exclude_stopwords:
            # Too strict:
            #with open (os.path.join(os.path.dirname(__file__), 'stopwords.txt'), "r") as _f:
            #    stopwords_list = _f.read().replace('\n', '').split()
            if not _NLTK_DISABLED:
                stopwords_list += nltk.corpus.stopwords.words('english')
            stopwords_list = set(stopwords_list)
        self.stopwords_list = stopwords_list

        if lemmatize:
            if not _NLTK_DISABLED:
                self.wlemm = nltk.WordNetLemmatizer()
            else:
                print('Warning: no lemmatizer available')


    def is_stopword(self, w):
        return w in self.stopwords_list

    def lemmatize(self, w0):
        if not hasattr(self, 'wlemm'):
            print('No lemmatization')
            return w0
        w = self.wlemm.lemmatize(w0.lower())
        if w in self.recover_list: return self.recover_list[w]
        return w

    def token2id(self):
        # NOTE: this accessor is shadowed by the instance attribute of the same
        # name set in __init__; access the mapping directly via the attribute.
        return self.token2id

    def id2token(self):
        if hasattr(self, '_id2token') and len(self.token2id) == len(self._id2token):
            return self._id2token
        else:
            self._id2token = dict((v, k) for k, v in self.token2id.items())
            return self._id2token

    def term_to_id(self, term0):
        term = self.lemmatize(term0)
        if not self.regex['word'].match(term): return None
        if self.exclude_stopwords and self.is_stopword(term): return None
        try:
            term_id = self.token2id[term]
        except KeyError:
            term_id = len(self.vocas)
            self.token2id[term] = term_id
            self.vocas.append(term)
            self.docfreq.append(0)
        return term_id

    def remove_stopwords(self, doc):
        if doc is None:
            return None
        if isinstance(doc, basestring):
            doc = doc.split()
        return ' '.join([w for w in doc if not self.is_stopword(w)]).strip()

    # Bag of words !
    def doc2bow(self, doc):
        l = dict()
        words = dict()
        doc = self.regex['ponctuation'].sub(' ', doc).split() if isinstance(doc, basestring) else doc
        for term in doc:
            id = self.term_to_id(term)
            if id is not None:
                l[id] = l.get(id, 0) + 1
                if id not in words:
                    words[id] = 1
                    # docfreq counts in how many documents a term appears;
                    # rare terms can later be pruned with cut_low_freq()
                    self.docfreq[id] += 1
        if hasattr(doc, 'close'): doc.close()
        return sorted(l.items())

    def cut_low_freq(self, corpus, threshold=1):
        new_vocas = []
        new_docfreq = []
        self.token2id = dict()
        conv_map = dict()
        for id, term in enumerate(self.vocas):
            freq = self.docfreq[id]
            if freq > threshold:
                new_id = len(new_vocas)
                self.token2id[term] = new_id
                new_vocas.append(term)
                new_docfreq.append(freq)
                conv_map[id] = new_id
        self.vocas = new_vocas
        self.docfreq = new_docfreq

        def conv(doc):
            new_doc = []
            for id in doc:
                if id in conv_map: new_doc.append(conv_map[id])
            return new_doc
        return [conv(doc) for doc in corpus]

    def __getitem__(self, v):
        return self.vocas[v]

    def size(self):
        return len(self.vocas)

    def is_stopword_id(self, id):
        return self.vocas[id] in self.stopwords_list

    # If the input is a list of strings, or a directory holding several .txt
    # documents, return the corpus as a list of token lists.
    # If the input is a directory holding a single sparse corpus file, return
    # a scipy sparse document-term matrix (plus the dictionary, if present).
    @classmethod
    def parse_corpus(cls, fname, bdir=""):
        dico = None
        if type(fname) is str:
            if not os.path.exists(fname): raise EnvironmentError('%s does not exist' % fname)
            if os.path.isdir(fname):
                import fnmatch
                bdir = fname
                corpus_files = []
                dico_files = []
                for root, dirnames, filenames in os.walk(bdir):
                    for filename in fnmatch.filter(filenames, '*.txt'):
                        if filename.startswith(('dico.','vocab.')):
                            dico_files.append(os.path.join(root, filename))
                        else:
                            corpus_files.append(os.path.join(root, filename))

                if len(corpus_files) == 1:
                    # Parse sparse matrix:  DOC_ID WORD_ID COUNT
                    with open(corpus_files[0],'r') as f:
                        # Read the header information
                        n_instances = int(f.readline())
                        n_features = int(f.readline())
                        n_nnz = int(f.readline())

                        # Build the document-term count matrix
                        # (LIL format for efficient incremental assignment)
                        data = sp.sparse.lil_matrix((n_instances, n_features), dtype=int)
                        for line in f:
                            doc_id, word_id, count = list(map(int, line.split()))
                            data[doc_id-1, word_id-1] = count
                        data = data.tocsr()
                    # If a dictionary file was found, parse it as well
                    if dico_files:
                        with open(dico_files[0], 'r') as f:
                            dico = {}
                            for i, line in enumerate(f):
                                word = line.split()
                                assert len(word) == 1
                                dico[i] = word[0]
                else:
                    # Parse documents, each of them being a .txt file
                    data = [cls.parse_document_f(f) for f in corpus_files]

            else:
                # Plain-file input is not handled yet
                raise NotImplementedError('plain file input: %s' % fname)
        elif type(fname) is list:
            # List of string as document
            docs = fname
            data = [cls.parse_document_l(d) for d in docs]
        else:
            raise NotImplementedError('file input: %s' % fname)

        return data, dico

    @classmethod
    def parse_document_f(cls, d):
        # Tokenize a .txt file: non-word characters are separators, tokens are lower-cased
        with open(d) as fin:
            return [word.lower() for line in fin
                    for word in cls.regex['not_word'].sub(' ', line).split()]

    @classmethod
    def parse_document_l(cls, d):
        return [word.lower() for word in cls.regex['not_word'].sub(' ', d).split()]
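
A minimal usage sketch (not part of the original source), with lemmatize=False so
that NLTK is not required (lemmatize() then just echoes each term and prints a
notice):

from pymake.util.vocabulary import Vocabulary

voca = Vocabulary(exclude_stopwords=False, lemmatize=False)
docs = ["The cat sat on the mat.", "The dog chased the cat!"]

# doc2bow() registers unseen terms on the fly and returns (term_id, count) pairs
bows = [voca.doc2bow(d) for d in docs]

print(voca.size())       # number of distinct terms seen so far
print(voca.token2id)     # word -> id mapping, built incrementally
print(bows[0])           # sorted (term_id, count) pairs of the first document
# Note: without a lemmatizer, tokens are kept as-is ('The' and 'the' get different ids).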

Classes

class Vocabulary (exclude_stopwords=False, lemmatize=True)
Source code
class Vocabulary(object):

    recover_list = {"wa":"was", "ha":"has"}

    regex = {
        'word' : re.compile(r'[a-zA-Z0-9_\-]+'),
        'not_word' : re.compile(r'[^a-zA-Z0-9_\-]+'),
        'ponctuation' : re.compile(r'[\.\,\;\:\!\?\(\)\{\}\[\]\'\"\`]+'),
    }

    def __init__(self, exclude_stopwords=False, lemmatize=True):

        try:
            import nltk
            _NLTK_DISABLED = False
        except ImportError:
            _NLTK_DISABLED = True

        self.vocas = []        # id to word
        self.token2id = dict() # word to id
        self.docfreq = []      # id to document frequency
        self.exclude_stopwords = exclude_stopwords

        stopwords_list = []
        if exclude_stopwords:
            # Too strict:
            #with open (os.path.join(os.path.dirname(__file__), 'stopwords.txt'), "r") as _f:
            #    stopwords_list = _f.read().replace('\n', '').split()
            if not _NLTK_DISABLED:
                stopwords_list += nltk.corpus.stopwords.words('english')
            stopwords_list = set(stopwords_list)
        self.stopwords_list = stopwords_list

        if lemmatize:
            if not _NLTK_DISABLED:
                self.wlemm = nltk.WordNetLemmatizer()
            else:
                print('Warning: no lemmatizer available')


    def is_stopword(self, w):
        return w in self.stopwords_list

    def lemmatize(self, w0):
        if not hasattr(self, 'wlemm'):
            print('No lemmatization')
            return w0
        w = self.wlemm.lemmatize(w0.lower())
        if w in self.recover_list: return self.recover_list[w]
        return w

    def token2id(self):
        # NOTE: this accessor is shadowed by the instance attribute of the same
        # name set in __init__; access the mapping directly via the attribute.
        return self.token2id

    def id2token(self):
        if hasattr(self, '_id2token') and len(self.token2id) == len(self._id2token):
            return self._id2token
        else:
            self._id2token = dict((v, k) for k, v in self.token2id.items())
            return self._id2token

    def term_to_id(self, term0):
        term = self.lemmatize(term0)
        if not self.regex['word'].match(term): return None
        if self.exclude_stopwords and self.is_stopword(term): return None
        try:
            term_id = self.token2id[term]
        except KeyError:
            term_id = len(self.vocas)
            self.token2id[term] = term_id
            self.vocas.append(term)
            self.docfreq.append(0)
        return term_id

    def remove_stopwords(self, doc):
        if doc is None:
            return None
        if isinstance(doc, basestring):
            doc = doc.split()
        return ' '.join([w for w in doc if not self.is_stopword(w)]).strip()

    # Bag of words !
    def doc2bow(self, doc):
        l = dict()
        words = dict()
        doc = self.regex['ponctuation'].sub(' ', doc).split() if isinstance(doc, basestring) else doc
        for term in doc:
            id = self.term_to_id(term)
            if id is not None:
                l[id] = l.get(id, 0) + 1
                if id not in words:
                    words[id] = 1
                    # docfreq counts in how many documents a term appears;
                    # rare terms can later be pruned with cut_low_freq()
                    self.docfreq[id] += 1
        if hasattr(doc, 'close'): doc.close()
        return sorted(l.items())

    def cut_low_freq(self, corpus, threshold=1):
        new_vocas = []
        new_docfreq = []
        self.token2id = dict()
        conv_map = dict()
        for id, term in enumerate(self.vocas):
            freq = self.docfreq[id]
            if freq > threshold:
                new_id = len(new_vocas)
                self.token2id[term] = new_id
                new_vocas.append(term)
                new_docfreq.append(freq)
                conv_map[id] = new_id
        self.vocas = new_vocas
        self.docfreq = new_docfreq

        def conv(doc):
            new_doc = []
            for id in doc:
                if id in conv_map: new_doc.append(conv_map[id])
            return new_doc
        return [conv(doc) for doc in corpus]

    def __getitem__(self, v):
        return self.vocas[v]

    def size(self):
        return len(self.vocas)

    def is_stopword_id(self, id):
        return self.vocas[id] in self.stopwords_list

    # If the input is a list of strings, or a directory holding several .txt
    # documents, return the corpus as a list of token lists.
    # If the input is a directory holding a single sparse corpus file, return
    # a scipy sparse document-term matrix (plus the dictionary, if present).
    @classmethod
    def parse_corpus(cls, fname, bdir=""):
        dico = None
        if type(fname) is str:
            if not os.path.exists(fname): raise EnvironmentError('%s does not exist' % fname)
            if os.path.isdir(fname):
                import fnmatch
                bdir = fname
                corpus_files = []
                dico_files = []
                for root, dirnames, filenames in os.walk(bdir):
                    for filename in fnmatch.filter(filenames, '*.txt'):
                        if filename.startswith(('dico.','vocab.')):
                            dico_files.append(os.path.join(root, filename))
                        else:
                            corpus_files.append(os.path.join(root, filename))

                if len(corpus_files) == 1:
                    # Parse sparse matrix:  DOC_ID WORD_ID COUNT
                    with open(corpus_files[0],'r') as f:
                        # Read the header information
                        n_instances = int(f.readline())
                        n_features = int(f.readline())
                        n_nnz = int(f.readline())

                        # Build the document-term count matrix
                        # (LIL format for efficient incremental assignment)
                        data = sp.sparse.lil_matrix((n_instances, n_features), dtype=int)
                        for line in f:
                            doc_id, word_id, count = list(map(int, line.split()))
                            data[doc_id-1, word_id-1] = count
                        data = data.tocsr()
                    # If a dictionary file was found, parse it as well
                    if dico_files:
                        with open(dico_files[0], 'r') as f:
                            dico = {}
                            for i, line in enumerate(f):
                                word = line.split()
                                assert len(word) == 1
                                dico[i] = word[0]
                else:
                    # Parse documents, each of them being a .txt file
                    data = [cls.parse_document_f(f) for f in corpus_files]

            else:
                # Plain-file input is not handled yet
                raise NotImplementedError('plain file input: %s' % fname)
        elif type(fname) is list:
            # List of string as document
            docs = fname
            data = [cls.parse_document_l(d) for d in docs]
        else:
            raise NotImplementedError('file input: %s' % fname)

        return data, dico

    @classmethod
    def parse_document_f(cls, d):
        # Tokenize a .txt file: non-word characters are separators, tokens are lower-cased
        with open(d) as fin:
            return [word.lower() for line in fin
                    for word in cls.regex['not_word'].sub(' ', line).split()]

    @classmethod
    def parse_document_l(cls, d):
        return [word.lower() for word in cls.regex['not_word'].sub(' ', d).split()]

Class variables

var recover_list
var regex

Static methods

def parse_corpus(fname, bdir='')
Source code
@classmethod
def parse_corpus(cls, fname, bdir=""):
    dico = None
    if type(fname) is str:
        if not os.path.exists(fname): raise EnvironmentError('%s does not exist' % fname)
        if os.path.isdir(fname):
            import fnmatch
            bdir = fname
            corpus_files = []
            dico_files = []
            for root, dirnames, filenames in os.walk(bdir):
                for filename in fnmatch.filter(filenames, '*.txt'):
                    if filename.startswith(('dico.','vocab.')):
                        dico_files.append(os.path.join(root, filename))
                    else:
                        corpus_files.append(os.path.join(root, filename))

            if len(corpus_files) == 1:
                # Parse sparse matrix:  DOC_ID WORD_ID COUNT
                with open(corpus_files[0],'r') as f:
                    # Read the header information
                    n_instances = int(f.readline())
                    n_features = int(f.readline())
                    n_nnz = int(f.readline())

                    # Build the document-term count matrix
                    # (LIL format for efficient incremental assignment)
                    data = sp.sparse.lil_matrix((n_instances, n_features), dtype=int)
                    for line in f:
                        doc_id, word_id, count = list(map(int, line.split()))
                        data[doc_id-1, word_id-1] = count
                    data = data.tocsr()
                # If a dictionary file was found, parse it as well
                if dico_files:
                    with open(dico_files[0], 'r') as f:
                        dico = {}
                        for i, line in enumerate(f):
                            word = line.split()
                            assert len(word) == 1
                            dico[i] = word[0]
            else:
                # Parse documents, each of them being a .txt file
                data = [cls.parse_document_f(f) for f in corpus_files]

        else:
            # Plain-file input is not handled yet
            raise NotImplementedError('plain file input: %s' % fname)
    elif type(fname) is list:
        # List of string as document
        docs = fname
        data = [cls.parse_document_l(d) for d in docs]
    else:
        raise NotImplementedError('file input: %s' % fname)

    return data, dico
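
A hedged sketch of the list-of-strings branch (the directory branches need a corpus
laid out on disk); the documents below are illustrative:

from pymake.util.vocabulary import Vocabulary

docs = ["A first toy document.", "A second toy document."]
data, dico = Vocabulary.parse_corpus(docs)

print(data[0])   # ['a', 'first', 'toy', 'document']
print(dico)      # None -- no dictionary file in this branch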
def parse_document_f(d)
Source code
@classmethod
def parse_document_f(cls, d):
    # Tokenize a .txt file: non-word characters are separators, tokens are lower-cased
    with open(d) as fin:
        return [word.lower() for line in fin
                for word in cls.regex['not_word'].sub(' ', line).split()]
def parse_document_l(d)
Source code
@classmethod
def parse_document_l(cls, d):
    return [word.lower() for word in cls.regex['not_word'].sub(' ', d).split()]
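
For reference, the class-level regexes drive the tokenization: every character
outside [a-zA-Z0-9_-] acts as a separator and tokens are lower-cased.

from pymake.util.vocabulary import Vocabulary

print(Vocabulary.parse_document_l("Hello, World! foo_bar-baz 42"))
# -> ['hello', 'world', 'foo_bar-baz', '42']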

Methods

def cut_low_freq(self, corpus, threshold=1)
Source code
def cut_low_freq(self, corpus, threshold=1):
    new_vocas = []
    new_docfreq = []
    self.token2id = dict()
    conv_map = dict()
    for id, term in enumerate(self.vocas):
        freq = self.docfreq[id]
        if freq > threshold:
            new_id = len(new_vocas)
            self.token2id[term] = new_id
            new_vocas.append(term)
            new_docfreq.append(freq)
            conv_map[id] = new_id
    self.vocas = new_vocas
    self.docfreq = new_docfreq

    def conv(doc):
        new_doc = []
        for id in doc:
            if id in conv_map: new_doc.append(conv_map[id])
        return new_doc
    return [conv(doc) for doc in corpus]
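
A small sketch of the pruning step. Note that cut_low_freq() expects each document
as a flat list of term ids, not the (id, count) pairs returned by doc2bow(); here
every term occurs once per document, so the counts can simply be dropped.

from pymake.util.vocabulary import Vocabulary

voca = Vocabulary(lemmatize=False)
docs = ["apple banana", "banana cherry", "banana"]
corpus = [[tid for tid, _ in voca.doc2bow(d)] for d in docs]

# keep only terms whose document frequency is strictly above the threshold
new_corpus = voca.cut_low_freq(corpus, threshold=1)
print(voca.vocas)      # ['banana'] -- 'apple' and 'cherry' appear in a single document
print(new_corpus)      # [[0], [0], [0]] -- surviving ids are remapped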
def doc2bow(self, doc)
Source code
def doc2bow(self, doc):
    l = dict()
    words = dict()
    doc = self.regex['ponctuation'].sub(' ', doc).split() if isinstance(doc, basestring) else doc
    for term in doc:
        id = self.term_to_id(term)
        if id is not None:
            l[id] = l.get(id, 0) + 1
            if id not in words:
                words[id] = 1
                # docfreq counts in how many documents a term appears;
                # rare terms can later be pruned with cut_low_freq()
                self.docfreq[id] += 1
    if hasattr(doc, 'close'): doc.close()
    return sorted(l.items())
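
A sketch of how docfreq accumulates across calls (again with lemmatize=False so no
NLTK is needed):

from pymake.util.vocabulary import Vocabulary

voca = Vocabulary(lemmatize=False)
voca.doc2bow("apple banana apple")
voca.doc2bow("banana cherry")

# docfreq counts documents, not occurrences
print(list(zip(voca.vocas, voca.docfreq)))
# -> [('apple', 1), ('banana', 2), ('cherry', 1)]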
def id2token(self)
Source code
def id2token(self):
    if hasattr(self, '_id2token') and len(self.token2id) == len(self._id2token):
        return self._id2token
    else:
        self._id2token = dict((v, k) for k, v in self.token2id.items())
        return self._id2token
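
A quick sketch: once a few documents have been indexed, id2token() returns the
reverse of token2id (the mapping is rebuilt lazily whenever the vocabulary grows).

from pymake.util.vocabulary import Vocabulary

voca = Vocabulary(lemmatize=False)
voca.doc2bow("apple banana")
print(voca.id2token())   # {0: 'apple', 1: 'banana'}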
def is_stopword(self, w)
Source code
def is_stopword(self, w):
    return w in self.stopwords_list
def is_stopword_id(self, id)
Source code
def is_stopword_id(self, id):
    return self.vocas[id] in self.stopwords_list
def lemmatize(self, w0)
Source code
def lemmatize(self, w0):
    if not hasattr(self, 'wlemm'):
        print('No lemmatization')
        return w0
    w = self.wlemm.lemmatize(w0.lower())
    if w in self.recover_list: return self.recover_list[w]
    return w
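
A hedged sketch, assuming NLTK and its WordNet data are installed (otherwise
lemmatize() simply returns its input). recover_list patches lemmas such as
'wa' -> 'was' that the noun-oriented WordNet lemmatizer can produce.

from pymake.util.vocabulary import Vocabulary

voca = Vocabulary(lemmatize=True)    # requires nltk and the wordnet corpus
print(voca.lemmatize("Cats"))        # 'cat'
print(voca.lemmatize("was"))         # 'was' -- the raw lemma 'wa' is recovered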
def remove_stopwords(self, doc)
Source code
def remove_stopwords(self, doc):
    if doc is None:
        return None
    if isinstance(doc, basestring):
        doc = doc.split()
    return ' '.join([w for w in doc if not self.is_stopword(w)]).strip()
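
An example assuming NLTK and its stopwords corpus are available
(exclude_stopwords=True pulls in the English stopword list):

from pymake.util.vocabulary import Vocabulary

voca = Vocabulary(exclude_stopwords=True, lemmatize=False)
print(voca.remove_stopwords("this is a cat on the mat"))
# -> 'cat mat'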
def size(self)
Source code
def size(self):
    return len(self.vocas)
def term_to_id(self, term0)
Source code
def term_to_id(self, term0):
    term = self.lemmatize(term0)
    if not self.regex['word'].match(term): return None
    if self.exclude_stopwords and self.is_stopword(term): return None
    try:
        term_id = self.token2id[term]
    except KeyError:
        term_id = len(self.vocas)
        self.token2id[term] = term_id
        self.vocas.append(term)
        self.docfreq.append(0)
    return term_id
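
term_to_id() assigns ids on first sight and filters out tokens without any word
character (and stopwords, when exclude_stopwords=True):

from pymake.util.vocabulary import Vocabulary

voca = Vocabulary(lemmatize=False)
print(voca.term_to_id("apple"))   # 0 -- new term registered
print(voca.term_to_id("apple"))   # 0 -- already known
print(voca.term_to_id("!!!"))     # None -- no word characters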
def token2id(self)
Source code
def token2id(self):
    # NOTE: this accessor is shadowed by the instance attribute of the same
    # name set in __init__; access the mapping directly via the attribute.
    return self.token2id