Example usage for org.deeplearning4j.models.word2vec Word2Vec Word2Vec

List of usage examples for the Word2Vec constructor of org.deeplearning4j.models.word2vec.Word2Vec

Introduction

On this page you can find example usages of the Word2Vec constructor of org.deeplearning4j.models.word2vec.Word2Vec.

Prototype

Word2Vec

Source Link

Usage

From source file:de.mpii.docsimilarity.mr.utils.io.WordVectorSerializer.java

License:Apache License

/**
 * @param modelFile/*www . j ava2 s  . co m*/
 * @return
 * @throws FileNotFoundException
 * @throws IOException
 * @throws NumberFormatException
 */
private static Word2Vec readTextModel(File modelFile) throws IOException, NumberFormatException {
    InMemoryLookupTable lookupTable;
    VocabCache cache;
    INDArray syn0;
    Word2Vec ret = new Word2Vec();
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(GzipUtils.isCompressedFilename(modelFile.getName())
                    ? new GZIPInputStream(new FileInputStream(modelFile))
                    : new FileInputStream(modelFile)))) {
        String line = reader.readLine();
        String[] initial = line.split(" ");
        int words = Integer.parseInt(initial[0]);
        int layerSize = Integer.parseInt(initial[1]);
        syn0 = Nd4j.create(words, layerSize);

        cache = new InMemoryLookupCache(false);

        int currLine = 0;
        while ((line = reader.readLine()) != null) {
            String[] split = line.split(" ");
            assert split.length == layerSize + 1;
            String word = split[0];

            float[] vector = new float[split.length - 1];
            for (int i = 1; i < split.length; i++) {
                vector[i - 1] = Float.parseFloat(split[i]);
            }

            syn0.putRow(currLine, Transforms.unitVec(Nd4j.create(vector)));

            cache.addWordToIndex(cache.numWords(), word);
            cache.addToken(new VocabWord(1, word));
            cache.putVocabWord(word);

            currLine++;
        }

        lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().cache(cache)
                .vectorLength(layerSize).build();
        lookupTable.setSyn0(syn0);

        ret.setVocab(cache);
        ret.setLookupTable(lookupTable);
    }
    return ret;
}

From source file:de.mpii.docsimilarity.mr.utils.io.WordVectorSerializer.java

License:Apache License

/**
 * Read a binary word2vec file./*w  w w .ja  v  a2  s .com*/
 *
 * @param modelFile
 *            the File to read
 * @param linebreaks
 *            if true, the reader expects each word/vector to be in a separate line, terminated
 *            by a line break
 * @return a {@link Word2Vec model}
 * @throws NumberFormatException
 * @throws IOException
 * @throws FileNotFoundException
 */
private static Word2Vec readBinaryModel(File modelFile, boolean linebreaks)
        throws NumberFormatException, IOException {
    InMemoryLookupTable lookupTable;
    VocabCache cache;
    INDArray syn0;
    int words, size;
    try (BufferedInputStream bis = new BufferedInputStream(GzipUtils.isCompressedFilename(modelFile.getName())
            ? new GZIPInputStream(new FileInputStream(modelFile))
            : new FileInputStream(modelFile)); DataInputStream dis = new DataInputStream(bis)) {
        words = Integer.parseInt(readString(dis));
        size = Integer.parseInt(readString(dis));
        syn0 = Nd4j.create(words, size);
        cache = new InMemoryLookupCache(false);
        lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().cache(cache).vectorLength(size)
                .build();

        String word;
        for (int i = 0; i < words; i++) {

            word = readString(dis);
            log.trace("Loading " + word + " with word " + i);

            float[] vector = new float[size];

            for (int j = 0; j < size; j++) {
                vector[j] = readFloat(dis);
            }

            syn0.putRow(i, Transforms.unitVec(Nd4j.create(vector)));

            cache.addWordToIndex(cache.numWords(), word);
            cache.addToken(new VocabWord(1, word));
            cache.putVocabWord(word);

            if (linebreaks) {
                dis.readByte(); // line break
            }
        }
    }

    Word2Vec ret = new Word2Vec();

    lookupTable.setSyn0(syn0);
    ret.setVocab(cache);
    ret.setLookupTable(lookupTable);
    return ret;

}

From source file:de.mpii.docsimilarity.mr.utils.io.WordVectorSerializer.java

License:Apache License

/**
 * Reads a binary word2vec file from an already opened stream, keeping only the required terms.
 *
 * <p>Every entry of the stream is read, but only words contained in {@code requiredTerms} are
 * added to the vocabulary and the weight matrix. The stream is wrapped in a
 * {@link GZIPInputStream} when {@code GzipUtils.isCompressedFilename(modelFile)} is true, and it
 * is closed when this method returns. Line-break handling follows {@code DEFAULT_LINEBREAKS}.
 * The weight matrix is still allocated with one row per word declared in the header; only the
 * leading rows (one per accepted word) are filled.
 *
 * @param modelFile
 *            the name of the model file; used only to decide whether the stream is compressed
 * @param modelstream
 *            the stream to read the model from
 * @param requiredTerms
 *            the words to load; must not be null
 * @return a {@link Word2Vec model}
 * @throws NumberFormatException
 *             if the word count or the vector length cannot be parsed as an integer
 * @throws IOException
 *             if reading from the stream fails
 */
public static Word2Vec readBinaryModel(String modelFile, FSDataInputStream modelstream,
        Set<String> requiredTerms) throws NumberFormatException, IOException {
    boolean linebreaks = DEFAULT_LINEBREAKS;
    InMemoryLookupTable lookupTable;
    VocabCache cache;
    INDArray syn0;
    int words, size;
    int count = 0;
    try (BufferedInputStream bis = new BufferedInputStream(
            GzipUtils.isCompressedFilename(modelFile) ? new GZIPInputStream(modelstream) : modelstream);
            DataInputStream dis = new DataInputStream(bis)) {
        words = Integer.parseInt(readString(dis));
        size = Integer.parseInt(readString(dis));
        System.out.println("words " + words + ", size " + size);
        syn0 = Nd4j.create(words, size);
        cache = new InMemoryLookupCache(false);
        lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().cache(cache).vectorLength(size)
                .build();

        String word;

        for (int i = 0; i < words; i++) {

            word = readString(dis);

            log.trace("Loading " + word + " with word " + i);

            // The vector must be consumed even for skipped words to stay aligned with the stream.
            float[] vector = new float[size];

            for (int j = 0; j < size; j++) {
                vector[j] = readFloat(dis);
            }
            if (requiredTerms.contains(word)) {
                // The word is registered under the next free vocabulary index, so its vector must
                // be stored in that same row. Using the file position i would make index and row
                // disagree as soon as one word has been skipped.
                int row = cache.numWords();
                syn0.putRow(row, Transforms.unitVec(Nd4j.create(vector)));
                cache.addWordToIndex(row, word);
                cache.addToken(new VocabWord(1, word));
                cache.putVocabWord(word);
                count++;
            }

            if (linebreaks) {
                dis.readByte(); // line break
            }
        }
    }

    Word2Vec ret = new Word2Vec();

    lookupTable.setSyn0(syn0);
    ret.setVocab(cache);
    ret.setLookupTable(lookupTable);
    System.out.println("Load " + count + " terms in word2vec.");
    return ret;

}

From source file:org.knime.ext.textprocessing.dl4j.util.WordVectorPortObjectUtils.java

License:Open Source License

/**
 * Converts wordVectors to {@link Word2Vec}. Sets {@link WeightLookupTable} and {@link VocabCache}. Depending on
 * specified word vector type this may lead to information loss. E.g. labels for {@link ParagraphVectors}.
 *
 * @param wordVectors/* w w  w.  j ava2s. c o  m*/
 * @return Word2Vec containing vocab and lookup table
 */
public static Word2Vec wordVectorsToWord2Vec(final WordVectors wordVectors) {
    final Word2Vec w2v = new Word2Vec();
    w2v.setLookupTable(wordVectors.lookupTable());
    w2v.setVocab(wordVectors.vocab());
    return w2v;
}