# Source code for word2vec.scripts_interface

from __future__ import division, print_function, unicode_literals

import sys
import subprocess


def word2vec(train, output, size=100, window=5, sample='1e-3', hs=0,
             negative=5, threads=12, iter_=5, min_count=5, alpha=0.025,
             debug=2, binary=1, cbow=1, save_vocab=None, read_vocab=None,
             verbose=False):
    """
    Run the ``word2vec`` command-line tool.

    Each argument is converted with ``str`` and passed to the executable as
    the value of the matching command-line flag. The defaults listed here
    are the defaults of this function (the values it always passes), which
    is what the executable ends up using. Nothing is returned.

    Parameters for training:
        train <file>
            Use text data from <file> to train the model
        output <file>
            Use <file> to save the resulting word vectors / word clusters
        size <int>
            Set size of word vectors; default is 100
        window <int>
            Set max skip length between words; default is 5
        sample <float>
            Set threshold for occurrence of words. Those that appear with
            higher frequency in the training data will be randomly
            down-sampled; default is '1e-3' (0 = off), useful value is 1e-5
        hs <int>
            Use Hierarchical Softmax; default is 0 (not used)
        negative <int>
            Number of negative examples; default is 5, common values are
            5 - 10 (0 = not used)
        threads <int>
            Use <int> threads; default is 12
        iter_ <int>
            Number of training iterations, passed as ``-iter``; default is 5
        min_count <int>
            This will discard words that appear less than <int> times; default
            is 5
        alpha <float>
            Set the starting learning rate; default is 0.025
        debug <int>
            Set the debug mode (default = 2 = more info during training)
        binary <int>
            Save the resulting vectors in binary mode; default is 1
            (0 = off)
        cbow <int>
            Use the continuous bag of words model; default is 1 (use 0 for
            the skip-gram model)
        save_vocab <file>
            The vocabulary will be saved to <file>; the flag is omitted when
            this is None
        read_vocab <file>
            The vocabulary will be read from <file>, not constructed from the
            training data; the flag is omitted when this is None
        verbose
            Print output from training
    """
    options = [
        ('-train', train),
        ('-output', output),
        ('-size', size),
        ('-window', window),
        ('-sample', sample),
        ('-hs', hs),
        ('-negative', negative),
        ('-threads', threads),
        ('-iter', iter_),
        ('-min-count', min_count),
        ('-alpha', alpha),
        ('-debug', debug),
        ('-binary', binary),
        ('-cbow', cbow),
    ]
    # The vocabulary flags are optional: only pass them when a file is given.
    if save_vocab is not None:
        options.append(('-save-vocab', save_vocab))
    if read_vocab is not None:
        options.append(('-read-vocab', read_vocab))

    command = ['word2vec']
    for flag, value in options:
        command.extend([flag, str(value)])

    run_cmd(command, verbose=verbose)


def word2clusters(train, output, classes, size=100, window=5, sample='1e-3',
                  hs=0, negative=5, threads=12, iter_=5, min_count=5,
                  alpha=0.025, debug=2, binary=1, cbow=1,
                  save_vocab=None, read_vocab=None, verbose=False):
    """
    Run the ``word2vec`` command-line tool with the ``-classes`` option.

    The command line is built the same way as for a plain ``word2vec`` run,
    with ``-classes <classes>`` added after the ``-cbow`` flag. Every value
    is converted with ``str`` before being passed. Nothing is returned.

    Parameters:
        train <file>
            Passed as ``-train``
        output <file>
            Passed as ``-output``
        classes <int>
            Passed as ``-classes``; required, there is no default
        size, window, sample, hs, negative, threads, iter_, min_count,
        alpha, debug, binary, cbow
            Passed as ``-size``, ``-window``, ``-sample``, ``-hs``,
            ``-negative``, ``-threads``, ``-iter``, ``-min-count``,
            ``-alpha``, ``-debug``, ``-binary`` and ``-cbow`` respectively
        save_vocab <file>
            Passed as ``-save-vocab``; the flag is omitted when this is None
        read_vocab <file>
            Passed as ``-read-vocab``; the flag is omitted when this is None
        verbose
            Print output from the tool while it runs
    """
    options = [
        ('-train', train),
        ('-output', output),
        ('-size', size),
        ('-window', window),
        ('-sample', sample),
        ('-hs', hs),
        ('-negative', negative),
        ('-threads', threads),
        ('-iter', iter_),
        ('-min-count', min_count),
        ('-alpha', alpha),
        ('-debug', debug),
        ('-binary', binary),
        ('-cbow', cbow),
        ('-classes', classes),
    ]
    # The vocabulary flags are optional: only pass them when a file is given.
    if save_vocab is not None:
        options.append(('-save-vocab', save_vocab))
    if read_vocab is not None:
        options.append(('-read-vocab', read_vocab))

    command = ['word2vec']
    for flag, value in options:
        command.extend([flag, str(value)])

    run_cmd(command, verbose=verbose)


def word2phrase(train, output, min_count=5, threshold=100, debug=2,
                verbose=False):
    """
    Run the ``word2phrase`` command-line tool.

    Every value is converted with ``str`` and passed as the value of the
    matching flag. Nothing is returned.

    Parameters:
        train <file>
            Passed as ``-train``
        output <file>
            Passed as ``-output``
        min_count <int>
            Passed as ``-min-count``; default is 5
        threshold <int>
            Passed as ``-threshold``; default is 100
        debug <int>
            Passed as ``-debug``; default is 2
        verbose
            Print output from the tool while it runs
    """
    options = [
        ('-train', train),
        ('-output', output),
        ('-min-count', min_count),
        ('-threshold', threshold),
        ('-debug', debug),
    ]

    command = ['word2phrase']
    for flag, value in options:
        command.extend([flag, str(value)])

    run_cmd(command, verbose=verbose)


def doc2vec(train, output, size=100, window=5, sample='1e-3', hs=0, negative=5,
            threads=12, iter_=5, min_count=5, alpha=0.025, debug=2, binary=1,
            cbow=1,
            save_vocab=None, read_vocab=None, verbose=False):
    """
    Run the ``word2vec-doc2vec`` command-line tool with sentence vectors on.

    The command line carries the same flags as a plain ``word2vec`` run and
    always ends with ``-sentence-vectors 1``. Every value is converted with
    ``str`` before being passed. Nothing is returned.

    Parameters:
        train <file>
            Passed as ``-train``
        output <file>
            Passed as ``-output``
        size, window, sample, hs, negative, threads, iter_, min_count,
        alpha, debug, binary, cbow
            Passed as ``-size``, ``-window``, ``-sample``, ``-hs``,
            ``-negative``, ``-threads``, ``-iter``, ``-min-count``,
            ``-alpha``, ``-debug``, ``-binary`` and ``-cbow`` respectively
        save_vocab <file>
            Passed as ``-save-vocab``; the flag is omitted when this is None
        read_vocab <file>
            Passed as ``-read-vocab``; the flag is omitted when this is None
        verbose
            Print output from the tool while it runs
    """
    options = [
        ('-train', train),
        ('-output', output),
        ('-size', size),
        ('-window', window),
        ('-sample', sample),
        ('-hs', hs),
        ('-negative', negative),
        ('-threads', threads),
        ('-iter', iter_),
        ('-min-count', min_count),
        ('-alpha', alpha),
        ('-debug', debug),
        ('-binary', binary),
        ('-cbow', cbow),
    ]
    # The vocabulary flags are optional: only pass them when a file is given.
    if save_vocab is not None:
        options.append(('-save-vocab', save_vocab))
    if read_vocab is not None:
        options.append(('-read-vocab', read_vocab))

    # The flag needs its leading dash like every other option; without it the
    # executable would not recognise the argument as an option name.
    options.append(('-sentence-vectors', 1))

    command = ['word2vec-doc2vec']
    for flag, value in options:
        command.extend([flag, str(value)])

    run_cmd(command, verbose=verbose)


def run_cmd(command, verbose=False):
    """
    Run ``command`` as a subprocess and wait for it to finish.

    Parameters:
        command <list>
            Program name followed by its arguments, handed to
            ``subprocess.Popen`` as an argument list
        verbose
            If true, echo every line of the child's standard output to
            ``sys.stdout`` as it arrives

    Raises:
        Exception
            Only when ``verbose`` is true: raised with the offending line as
            its message as soon as an echoed line contains ``ERROR:``
    """
    proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    if verbose:
        for line in proc.stdout:
            # The pipe yields byte strings. On Python 3 ``str()`` of bytes
            # gives the ``b'...'`` repr instead of the text, so decode
            # explicitly; on Python 2 the line already is a ``str`` and is
            # passed through untouched.
            if not isinstance(line, str):
                line = line.decode('utf-8', 'replace')
            sys.stdout.write(line)
            if 'ERROR:' in line:
                raise Exception(line)
            sys.stdout.flush()

    # Drain whatever is left on both pipes and wait for the process to exit.
    out, err = proc.communicate()