Source code for word2vec.scripts_interface

from __future__ import division, print_function, unicode_literals

import sys
import subprocess


def word2vec(train, output, size=100, window=5, sample='1e-3', hs=0,
             negative=5, threads=12, iter_=5, min_count=5, alpha=0.025,
             debug=2, binary=1, cbow=1, save_vocab=None, read_vocab=None,
             verbose=False):
    """
    word2vec execution

    Builds the argument list for the ``word2vec`` executable and hands it
    to ``run_cmd``. Every value is converted with ``str`` and passed after
    the flag of the same name (``min_count`` becomes ``-min-count``,
    ``iter_`` becomes ``-iter``). The defaults listed below are the ones in
    this function's signature.

    Parameters for training:
        train <file>
            Use text data from <file> to train the model
        output <file>
            Use <file> to save the resulting word vectors / word clusters
        size <int>
            Set size of word vectors; default is 100
        window <int>
            Set max skip length between words; default is 5
        sample <float>
            Set threshold for occurrence of words. Those that appear with
            higher frequency in the training data will be randomly
            down-sampled; default is '1e-3'
        hs <int>
            Use Hierarchical Softmax; default is 0 (not used)
        negative <int>
            Number of negative examples; default is 5 (0 = not used)
        threads <int>
            Use <int> threads; default is 12
        iter_ <int>
            Passed as ``-iter`` (the number of training iterations in the
            word2vec tool); default is 5
        min_count <int>
            This will discard words that appear less than <int> times;
            default is 5
        alpha <float>
            Set the starting learning rate; default is 0.025
        debug <int>
            Set the debug mode (default = 2 = more info during training)
        binary <int>
            Save the resulting vectors in binary mode; default is 1 (on)
        cbow <int>
            Use the continuous bag of words model; default is 1 (the
            word2vec tool's usage text gives 0 as the skip-gram setting)
        save_vocab <file>
            The vocabulary will be saved to <file>; omitted when None
        read_vocab <file>
            The vocabulary will be read from <file>, not constructed from
            the training data; omitted when None
        verbose
            Print output from training
    """
    options = [
        ('-train', train),
        ('-output', output),
        ('-size', size),
        ('-window', window),
        ('-sample', sample),
        ('-hs', hs),
        ('-negative', negative),
        ('-threads', threads),
        ('-iter', iter_),
        ('-min-count', min_count),
        ('-alpha', alpha),
        ('-debug', debug),
        ('-binary', binary),
        ('-cbow', cbow),
    ]
    # The vocabulary flags are only emitted when the caller supplied a path.
    if save_vocab is not None:
        options.append(('-save-vocab', save_vocab))
    if read_vocab is not None:
        options.append(('-read-vocab', read_vocab))

    command = ['word2vec']
    for flag, setting in options:
        command += [flag, str(setting)]

    run_cmd(command, verbose=verbose)
def word2clusters(train, output, classes, size=100, window=5, sample='1e-3',
                  hs=0, negative=5, threads=12, iter_=5, min_count=5,
                  alpha=0.025, debug=2, binary=1, cbow=1, save_vocab=None,
                  read_vocab=None, verbose=False):
    """
    word2vec execution with the ``-classes`` flag

    Builds the same ``word2vec`` command line as the plain training call
    and additionally passes ``classes`` as ``-classes`` after ``-cbow``.
    Every value is converted with ``str``.

    Parameters:
        train, output, size, window, sample, hs, negative, threads,
        min_count, alpha, debug, binary, cbow
            Passed after the flag of the same name (``min_count`` becomes
            ``-min-count``)
        iter_
            Passed as ``-iter``
        classes
            Passed as ``-classes`` (presumably the number of word clusters
            to produce; confirm against the word2vec tool's usage text)
        save_vocab, read_vocab
            Passed as ``-save-vocab`` / ``-read-vocab``; each is omitted
            when None
        verbose
            Forwarded to ``run_cmd``
    """
    options = [
        ('-train', train),
        ('-output', output),
        ('-size', size),
        ('-window', window),
        ('-sample', sample),
        ('-hs', hs),
        ('-negative', negative),
        ('-threads', threads),
        ('-iter', iter_),
        ('-min-count', min_count),
        ('-alpha', alpha),
        ('-debug', debug),
        ('-binary', binary),
        ('-cbow', cbow),
        ('-classes', classes),
    ]
    # The vocabulary flags are only emitted when the caller supplied a path.
    if save_vocab is not None:
        options.append(('-save-vocab', save_vocab))
    if read_vocab is not None:
        options.append(('-read-vocab', read_vocab))

    command = ['word2vec']
    for flag, setting in options:
        command += [flag, str(setting)]

    run_cmd(command, verbose=verbose)
def word2phrase(train, output, min_count=5, threshold=100, debug=2,
                verbose=False):
    """
    word2phrase execution

    Builds the argument list for the ``word2phrase`` executable and hands
    it to ``run_cmd``. Every value is converted with ``str``.

    Parameters:
        train
            Passed as ``-train``
        output
            Passed as ``-output``
        min_count
            Passed as ``-min-count``; default is 5
        threshold
            Passed as ``-threshold``; default is 100
        debug
            Passed as ``-debug``; default is 2
        verbose
            Forwarded to ``run_cmd``
    """
    options = [
        ('-train', train),
        ('-output', output),
        ('-min-count', min_count),
        ('-threshold', threshold),
        ('-debug', debug),
    ]

    command = ['word2phrase']
    for flag, setting in options:
        command += [flag, str(setting)]

    run_cmd(command, verbose=verbose)
def doc2vec(train, output, size=100, window=5, sample='1e-3', hs=0,
            negative=5, threads=12, iter_=5, min_count=5, alpha=0.025,
            debug=2, binary=1, cbow=1, save_vocab=None, read_vocab=None,
            verbose=False):
    """
    word2vec-doc2vec execution

    Builds the argument list for the ``word2vec-doc2vec`` executable,
    always ending with ``-sentence-vectors 1``, and hands it to
    ``run_cmd``. Every value is converted with ``str``.

    Parameters:
        train, output, size, window, sample, hs, negative, threads,
        min_count, alpha, debug, binary, cbow
            Passed after the flag of the same name (``min_count`` becomes
            ``-min-count``)
        iter_
            Passed as ``-iter``
        save_vocab, read_vocab
            Passed as ``-save-vocab`` / ``-read-vocab``; each is omitted
            when None
        verbose
            Forwarded to ``run_cmd``
    """
    options = [
        ('-train', train),
        ('-output', output),
        ('-size', size),
        ('-window', window),
        ('-sample', sample),
        ('-hs', hs),
        ('-negative', negative),
        ('-threads', threads),
        ('-iter', iter_),
        ('-min-count', min_count),
        ('-alpha', alpha),
        ('-debug', debug),
        ('-binary', binary),
        ('-cbow', cbow),
    ]
    # The vocabulary flags are only emitted when the caller supplied a path.
    if save_vocab is not None:
        options.append(('-save-vocab', save_vocab))
    if read_vocab is not None:
        options.append(('-read-vocab', read_vocab))

    # Like every other option, this one needs its leading dash; the bare
    # word 'sentence-vectors' would not be recognised as a flag.
    options.append(('-sentence-vectors', '1'))

    command = ['word2vec-doc2vec']
    for flag, setting in options:
        command += [flag, str(setting)]

    run_cmd(command, verbose=verbose)
def run_cmd(command, verbose=False):
    """
    Run an external command and wait for it to finish

    Parameters:
        command
            Argument list given to ``subprocess.Popen`` (program name
            first); stdout and stderr are both captured through pipes
        verbose
            When true, every line the child writes to stdout is copied to
            ``sys.stdout`` as it arrives

    Raises:
        Exception
            Only when ``verbose`` is true: an echoed line contains
            ``'ERROR:'``. The exception message is that line.

    Returns None; the captured output is discarded.
    """
    proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    if verbose:
        for line in proc.stdout:
            # On Python 3 the pipe yields bytes, and str() of bytes is the
            # repr (b'...'), not the text. On Python 2 the line is already
            # a str and is left untouched.
            if not isinstance(line, str):
                line = line.decode('utf-8', 'replace')
            sys.stdout.write(line)
            if 'ERROR:' in line:
                # Do not leave the child running with open pipes behind
                # the exception.
                try:
                    proc.kill()
                except OSError:
                    # The child had already exited; nothing left to stop.
                    pass
                proc.communicate()
                raise Exception(line)
            sys.stdout.flush()

    # Drain whatever is left on both pipes and reap the child.
    proc.communicate()