Module pymake.frontend.frontendtext

Source code
import sys, os
from collections import defaultdict
from itertools import chain
from string import Template

import numpy as np
import gensim

from .frontend import DataBase
from pymake.util.vocabulary import Vocabulary
# Note: make_path and set_v_to are used below but defined elsewhere in pymake.



class frontendText(DataBase):
    """ Frontend for text Corpus """

    def __init__(self, expe=None):
        super(frontendText, self).__init__(expe)
        self._data_type = 'text'

    def load_data(self, randomize=False):
        """ Load data according to different scheme:
            * Corpus from file dataset
            * Corpus from random generator
            """

        corpus_name = self.corpus_name
        self.get_corpus(corpus_name)

        # @DEBUG
        if str(self.N).isdigit() and int(self.N) > self.data.shape[0]:
            raise ValueError('Requested sample size %s exceeds the corpus size (-n option)' % self.N)

        if randomize:
            self.shuffle_docs()
        return self.data

    def make_testset(self, ratio):
        self.log.warning('check why and when overflow occurs in the Stirling matrix!?')
        self.log.warning('debug why values greater than 6000 appear in the Stirling matrix')
        D = self.data.shape[0]
        d = int(D * ratio)
        data = self.data[:d]
        data_t = self.data[d:]
        return data, data_t

    # @Debug: remove documents for which a count > 6000 occurs, because of the Stirling matrix!
    def sample(self, N=None, **args):
        N = N or self.N
        n = self.data.shape[0]
        if not N or N == 'all':
            self.N = 'all'
            # To remove !
            if self.corpus_name == '20ngroups':
                data = self.data[:10000]
                empty_words = np.where(data.sum(0).A[0] == 0)[0]
                new_cols = np.delete(np.arange(data.shape[1]), empty_words)
                self.data = data[:, new_cols]

        else:
            N = int(N)
            data = self.data[:N]
            # Here we come to a streaming problem!
            # @DEBUG manage id2word
            if isinstance(data, np.ndarray):
                empty_words = np.where(data.sum(0) == 0)[0]
                self.data = np.delete(data, empty_words, axis=1)
            elif data.format == 'csr':
                empty_words = np.where(data.sum(0).A[0] == 0)[0]
                new_cols = np.delete(np.arange(data.shape[1]), empty_words)
                self.data = data[:, new_cols]

        # @debug to remove
        _l = (self.data >= 6000).sum(1).A.T[0]
        print(_l)
        tt = self.data[_l > 0]
        for t in tt:
            print(t[t >= 6000])
        self.data = self.data[_l == 0]
        return self.data

    ### Get and preprocess text
    #   See the Vocabulary class...
    #   * Tokenisation from scratch
    #   * Stop-word removal from scratch
    #   * Lemmatization from WordNet
    #   * Load or Save in a Gensim context
    #       - Load has priority over Save
    # @Debug: There is a conversion to a gensim corpus to use its serialization library, and then back to a scipy corpus.
    #   This could be avoided by using our own serialization library, using Gensim only if needed.
    def textloader(self, target, bdir=None, corpus_name="", n=None):
        if type(target) is str and os.path.isfile(target):
            bdir = os.path.dirname(target)
        elif bdir is None:
            bdir = self.basedir
        fn = 'corpus'
        if n:
            fn += str(n)
        elif type(target) is not str:
            n = len(target)
            fn += str(n)

        if corpus_name:
            fname = bdir + '/'+fn+'_' + corpus_name + '.mm'
        else:
            fname = bdir + '/'+fn+'.mm'

        if self._load_data and os.path.isfile(fname):
            data = gensim.corpora.MmCorpus(fname)
            data = gensim.matutils.corpus2csc(data, dtype=int).T
            id2word = dict(gensim.corpora.dictionary.Dictionary.load_from_text(fname + '.dico'))
        else:
            print('re-Building Corpus...')
            raw_data, id2word = Vocabulary.parse_corpus(target)

            # Corpus will be in bag of words format !
            if type(raw_data) is list:
                voca = Vocabulary(exclude_stopwords=True)
                data = [voca.doc2bow(doc) for doc in raw_data]
                data = gensim.matutils.corpus2csc(data, dtype=int).T # Would be faster with #doc #term #nnz
            else:
                data = raw_data

            if self._save_data:
                make_path(bdir)
                _data = gensim.matutils.Sparse2Corpus(data, documents_columns=False)
                voca_gensim = gensim.corpora.dictionary.Dictionary.from_corpus(_data, id2word)
                voca_gensim.save_as_text(fname+'.dico')
                gensim.corpora.MmCorpus.serialize(fname=fname, corpus=_data)
                #@Debug how to get the corpus from list of list ?
                #_data = gensim.corpora.MmCorpus(fname)

        return data, id2word

    def get_corpus(self, corpus_name):
        self.make_io_path()
        bdir = self.basedir
        data_t = None
        if corpus_name == 'random':
            # Synthetic corpus from the random generator
            data = self.random()
            id2word = None
        elif corpus_name == 'lucene':
            raise NotImplementedError
            #searcher = warm_se(config)
            #q = config.get('q'); q['limit'] = config['limit_train']
            #id2word = searcher.get_id2word()
            #corpus = searcher.self.parse_corpus(q, vsm=config['vsm'], chunk=1000, batch=True)
        elif corpus_name == '20ngroups_sklearn':
            from sklearn.datasets import fetch_20newsgroups
            ngroup_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=None)
            ngroup_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=None)
            train_data = ngroup_train.data
            test_data = ngroup_test.data
            #corpus, id2word = self.textloader(train_data, bdir=bdir, corpus_name='train', n=config.get('N'))
            corpus, id2word = self.textloader(train_data, bdir=bdir, corpus_name='train')
            corpus_t, id2word_t = self.textloader(test_data, bdir=bdir, corpus_name='test')

            K = self.K
            #################
            ### Group Control
            test_classes = ngroup_test.target
            train_classes = ngroup_train.target
            if K == 6 and len(ngroup_test.target_names) != 6:
                # Wrap to subgroups
                target_names = ['comp', 'misc', 'rec', 'sci', 'talk', 'soc']
                map_ = dict([(0,5), (1,0), (2,0), (3,0), (4,0), (5,0), (6,1), (7,2), (8,2), (9,2), (10,2), (11,3), (12,3), (13,3), (14,3), (15,5), (16,4), (17,4), (18,4), (19,5)])
                test_classes = set_v_to(test_classes, map_)
                train_classes = set_v_to(train_classes, map_)
            else:
                target_names = ngroup_test.target_names
            C = len(target_names)

        elif corpus_name == 'wikipedia':
            # ? file type
            # Create
            command = './gensim/gensim/scripts/make_wikicorpus_ml.py '
            command += '/work/data/wikipedia/enwiki-latest-pages-articles.xml.bz2 ../pymake/data/wikipedia/wiki_en'
            os.system(command)
            # Load
            error = 'Load Wikipedia corpus'
            raise NotImplementedError(error)
        elif corpus_name == 'odp':
            # SVMlight file type
            from sklearn.datasets import load_svmlight_files, load_svmlight_file
            fn_train = os.path.join(bdir, 'train.txt')
            fn_test = os.path.join(bdir, 'test.txt')
            # More features in the test set than in the train set!
            data, train_classes = load_svmlight_file(fn_train)
            data_t, test_classes = load_svmlight_file(fn_test)
            id2word = None
        elif corpus_name in ('reuter50', 'nips12', 'nips', 'enron', 'kos', 'nytimes', 'pubmed', '20ngroups'):
            # DOC_ID FEAT_ID COUNT file type
            data, id2word = self.textloader(bdir, corpus_name=corpus_name)
        else:
            raise ValueError('Unknown corpus: %s' % corpus_name)

        self.data = data
        self.id2word = id2word
        if data_t is not None:
            self.data_t = data_t
        #else:
        #    raise NotImplementedError('Corpus test ?')

        return True

    def get_data_prop(self):
        prop = defaultdict()
        prop.update({'corpus': self.corpus_name,
                     'instances': self.data.shape[0]})
        nnz = self.data.sum()
        _nnz = self.data.sum(axis=1)
        dct = {'features': self.data.shape[1],
               'nnz': nnz,
               'nnz_mean': _nnz.mean(),
               'nnz_var': _nnz.var(),
               'train_size': None,
               'test_size': None,
              }
        prop.update(dct)
        return prop

    def template(self, dct):
        text_templ = '''###### $corpus_name
        Building: $time minutes
        Documents: $instances
        Nnz: $nnz
        Nnz mean: $nnz_mean
        Nnz var: $nnz_var
        Vocabulary: $features
        train: $train_size
        test: $test_size
        \n'''
        return Template(text_templ).substitute(dct)

    def print_vocab(self, data, id2word):
        if id2word:
            return gensim.corpora.dictionary.Dictionary.from_corpus(data, id2word) #; print voca

    def shuffle_docs(self):
        self.shuffle_instances()

    # Return, for each document of a count matrix, the document expanded into a stream of word ids.
    # Assumes a sparse (CSR) matrix.
    @staticmethod
    def sparse2stream(data):
        #new_data = []
        #for d in data:
        #    new_data.append(d[d.nonzero()].A1)
        bow = []
        for doc in data:
            # Also, see collections.Counter.elements() ...
            bow.append(np.array(list(chain(*[doc[0, i] * [i] for i in doc.nonzero()[1]]))))
        bow = np.array(bow)
        #map(np.random.shuffle, bow)
        return bow


    # Debug
    def run_lda(self):
        pass
   #     # Cross Validation settings...
   #     #@DEBUG: do we need to remake the vocabulary ??? id2word would impact the topic word distribution ?
   #     if corpus_t is None:
   #         pass
   #         #take 80-20 %
   #         # remake vocab and shape !!!
   #         # manage downside
   #     try:
   #         total_corpus = len(corpus)
   #         total_corpus_t = len(corpus_t)
   #     except:
   #         total_corpus = corpus.shape[0]
   #         total_corpus_t = corpus.shape[0]
   #     if config.get('N'):
   #         N = config['N']
   #     else:
   #         N = total_corpus
   #     corpus = corpus[:N]
   #     n_percent = float(N) / total_corpus
   #     n_percent = int(n_percent * total_corpus_t) or 10
   #     heldout_corpus = corpus_t[:n_percent]

   #     ############
   #     ### Load LDA
   #     load = config['load_model']
   #     # Path for LDA model!
   #     bdir = '../PyNPB/data/'
   #     bdir = os.path.join(bdir,config.get('corpus'), config.get('bdir', ''))
   #     lda = lda_gensim(corpus, id2word=id2word, K=K, bdir=bdir, load=load, model=config['model'], alpha=config['hyper'], n=config['N'], heldout_corpus=heldout_corpus)
   #     lda.inference_time = datetime.now() - last_d
   #     last_d = ellapsed_time('LDA Inference -- '+config['model'], last_d)

   #     ##############
   #     ### Log Output
   #     lda.print_topics(K)

   #     ##############
   #     ### Prediction
   #     corpus_t = corpus
   #     if config['predict'] and true_classes is not None and C == K:
   #         true_classes = train_classes
   #         predict_class = []
   #         confusion_mat = np.zeros((K,C))
   #         startt = datetime.now()
   #         for i, d in enumerate(corpus_t):
   #             d_t = lda.get_document_topics(d, minimum_probability=0.01)
   #             t = max(d_t, key=lambda item:item[1])[0]
   #             predict_class.append(t)
   #             c = true_classes[i]
   #             confusion_mat[t, c] += 1
   #         last_d = ellapsed_time('LDA Prediction', startt)
   #         predict_class = np.array(predict_class)
   #         lda.confusion_matrix = confusion_mat

   #         map_kc = map_class2cluster_from_confusion(confusion_mat)
   #         #new_predict_class = set_v_to(predict_class, dict(map_kc))

   #         print "Confusion Matrix, KxC:"
   #         print confusion_mat
   #         print map_kc
   #         print [(k, target_names[c]) for k,c in map_kc]

   #         purity = confusion_mat.max(axis=1).sum() / len(corpus_t)
   #         print 'Purity (K=%s, C=%s, D=%s): %s' % (K, C, len(corpus_t), purity)

   #         #precision = np.sum(new_predict_class == true_classes) / float(len(predict_class)) # equal !!!
   #         precision = np.sum(confusion_mat[zip(*map_kc)]) / float(len(corpus_t))
   #         print 'Ratio Groups Control: %s' % (precision)

   #     if save:
   #         ## Too big
   #         lda.expElogbeta = None
   #         lda.sstats = None
   #         lda.save(lda.fname)

   #     if config.get('_verbose'):
   #         #print lda.top_topics(corpus)
   #         for d in corpus:
   #             print lda.get_document_topics(d, minimum_probability=0.01)

   #     print lda
   #     if type(corpus) is not list:
   #         print corpus
   #         print corpus_t
   #     self.print_vocab(corpus, id2word)

Classes

class frontendText (expe=None)

Frontend for text Corpus
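A minimal usage sketch (the expe keys below are placeholders; the exact settings DataBase expects are not shown in this module):

# Hypothetical experiment spec; key names are illustrative.
expe = {'corpus': '20ngroups_sklearn', 'N': 'all', 'K': 6}

fr = frontendText(expe)
data = fr.load_data(randomize=True)     # sparse document-term matrix
data = fr.sample()                      # optional subsampling / column pruning
train, held_out = fr.make_testset(0.8)  # 80/20 split over documents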


Ancestors

pymake.frontend.frontend.DataBase

Static methods

def sparse2stream(data)
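A small sketch of what sparse2stream produces from a 2x3 count matrix (assuming scipy.sparse input, as the rest of the module does):

import numpy as np
import scipy.sparse as sp

counts = sp.csr_matrix(np.array([[2, 0, 1],
                                 [0, 3, 0]]))
streams = frontendText.sparse2stream(counts)
# streams[0] -> array([0, 0, 2])   word 0 twice, word 2 once
# streams[1] -> array([1, 1, 1])   word 1 three times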

Methods

def get_corpus(self, corpus_name)
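In the 20ngroups_sklearn branch, the 20 newsgroup labels are collapsed into 6 super-groups via the map_ dict and the set_v_to helper (defined elsewhere in pymake). The intended remapping can be sketched with plain numpy:

import numpy as np

# 20 labels -> 6 super-groups (comp, misc, rec, sci, talk, soc), as in map_ above.
map_ = {0: 5, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, 7: 2, 8: 2, 9: 2,
        10: 2, 11: 3, 12: 3, 13: 3, 14: 3, 15: 5, 16: 4, 17: 4, 18: 4, 19: 5}

labels = np.array([0, 7, 19, 11])              # example ngroup targets
lut = np.array([map_[k] for k in range(20)])   # lookup table
grouped = lut[labels]                          # -> array([5, 2, 5, 3])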
def get_data_prop(self)
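get_data_prop feeds template(), which fills a string.Template; a sketch of the round trip (the $corpus_name and $time fields are not produced by get_data_prop, so they are supplied by hand here; fr is a loaded frontendText instance):

props = fr.get_data_prop()
props.update({'corpus_name': props['corpus'], 'time': 0})
print(fr.template(props))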
def load_data(self, randomize=False)

Load data according to different schemes: a corpus from a file dataset, or a corpus from the random generator.

def make_testset(self, ratio)
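make_testset splits along the document axis; e.g. with 100 documents and ratio=0.8, the first 80 rows become the train slice and the remaining 20 the held-out slice:

train, held_out = fr.make_testset(0.8)   # fr.data has shape (100, V)
# train.shape    -> (80, V)
# held_out.shape -> (20, V)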
def print_vocab(self, data, id2word)
def run_lda(self)
def sample(self, N=None, **args)
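The pruning of empty vocabulary columns that sample() applies to a CSR matrix can be reproduced standalone (a sketch, leaving aside the id2word bookkeeping flagged by the @DEBUG note):

import numpy as np
import scipy.sparse as sp

data = sp.csr_matrix(np.array([[1, 0, 2],
                               [3, 0, 0]]))
empty_words = np.where(data.sum(0).A[0] == 0)[0]             # -> array([1])
new_cols = np.delete(np.arange(data.shape[1]), empty_words)  # -> array([0, 2])
data = data[:, new_cols]                                     # shape (2, 2)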
def shuffle_docs(self)
def template(self, dct)
def textloader(self, target, bdir=None, corpus_name='', n=None)
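textloader caches the corpus on disk in Matrix Market format; the gensim round trip it relies on looks roughly like this (a standalone sketch; the path and the toy bag-of-words corpus are illustrative):

import gensim

bow = [[(0, 2), (3, 1)], [(1, 4)]]                  # toy bag-of-words corpus
fname = '/tmp/corpus.mm'                            # illustrative path

# Save: bag-of-words -> .mm file (as in the _save_data branch)
gensim.corpora.MmCorpus.serialize(fname=fname, corpus=bow)

# Load: .mm file -> gensim corpus -> sparse docs x terms matrix (as in the _load_data branch)
mm = gensim.corpora.MmCorpus(fname)
data = gensim.matutils.corpus2csc(mm, dtype=int).T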

Inherited members