Source code for word2vec.wordclusters

from __future__ import division, print_function, unicode_literals

import numpy as np


[docs]class WordClusters(object):

[docs]    def __init__(self, vocab, clusters):
        self.vocab = vocab
        self.clusters = clusters

[docs]    def ix(self, word):
        """
        Returns the index on self.vocab and self.clusters for `word`
        """
        temp = np.where(self.vocab == word)[0]
        if temp.size == 0:
            raise KeyError('Word not in vocabulary')
        else:
            return temp[0]

    def __getitem__(self, word):
        return self.get_cluster(word)

[docs]    def get_cluster(self, word):
        """
        Returns the cluster number for a word in the vocabulary
        """
        idx = self.ix(word)
        return self.clusters[idx]

[docs]    def get_words_on_cluster(self, cluster):
        return self.vocab[self.clusters == cluster]

    @classmethod
[docs]    def from_text(cls, fname):
        vocab = np.genfromtxt(fname, dtype=np.object, delimiter=' ', usecols=0)
        clusters = np.genfromtxt(fname, dtype=int, delimiter=' ', usecols=1)
        return cls(vocab=vocab, clusters=clusters)