Example usage for org.deeplearning4j.models.word2vec.wordstore.inmemory InMemoryLookupCache InMemoryLookupCache

List of usage examples for org.deeplearning4j.models.word2vec.wordstore.inmemory InMemoryLookupCache InMemoryLookupCache

Introduction

In this page you can find the example usage for org.deeplearning4j.models.word2vec.wordstore.inmemory InMemoryLookupCache InMemoryLookupCache.

Prototype

public InMemoryLookupCache() 

Source Link

Usage

From source file:de.mpii.docsimilarity.mr.utils.io.WordVectorSerializer.java

License: Apache License

/**
 * Loads an in memory cache from the given path (sets syn0 and the vocab)
 *
 * @param vectorsFile/*from w ww  . j  av a 2  s  . c o m*/
 *            the path of the file to load
 * @return
 * @throws FileNotFoundException
 */
public static Pair<InMemoryLookupTable, VocabCache> loadTxt(File vectorsFile) throws FileNotFoundException {
    BufferedReader write = new BufferedReader(new FileReader(vectorsFile));
    VocabCache cache = new InMemoryLookupCache();

    InMemoryLookupTable lookupTable;

    LineIterator iter = IOUtils.lineIterator(write);
    List<INDArray> arrays = new ArrayList<>();
    while (iter.hasNext()) {
        String line = iter.nextLine();
        String[] split = line.split(" ");
        String word = split[0];
        VocabWord word1 = new VocabWord(1.0, word);
        cache.addToken(word1);
        cache.addWordToIndex(cache.numWords(), word);
        word1.setIndex(cache.numWords());
        cache.putVocabWord(word);
        INDArray row = Nd4j.create(Nd4j.createBuffer(split.length - 1));
        for (int i = 1; i < split.length; i++) {
            row.putScalar(i - 1, Float.parseFloat(split[i]));
        }
        arrays.add(row);
    }

    INDArray syn = Nd4j.create(new int[] { arrays.size(), arrays.get(0).columns() });
    for (int i = 0; i < syn.rows(); i++) {
        syn.putRow(i, arrays.get(i));
    }

    lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().vectorLength(arrays.get(0).columns())
            .useAdaGrad(false).cache(cache).build();
    Nd4j.clearNans(syn);
    lookupTable.setSyn0(syn);

    iter.close();

    return new Pair<>(lookupTable, cache);
}

From source file:doc2vec.LuceneDocIterator.java

/**
 * Trains paragraph-vector (doc2vec) embeddings over the documents in a Lucene index.
 *
 * @param indexDir the directory holding the Lucene index to iterate over
 * @throws Exception if index iteration or training fails
 */
void learnDocEmbeddings(File indexDir) throws Exception {

    // Whether document labels were stored alongside the words (config-driven, off by default).
    boolean storedLabels = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));

    SentenceIterator sentences = new LuceneDocIterator(indexDir, stopFile, storedLabels);
    InMemoryLookupCache vocabCache = new InMemoryLookupCache();

    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource docLabels = new LabelsSource("DOCNO_");

    // Configure the trainer step by step instead of one long fluent chain.
    ParagraphVectors.Builder builder = new ParagraphVectors.Builder();
    builder.minWordFrequency(minwordfreq);
    builder.iterations(3);
    builder.epochs(5);
    builder.layerSize(numDimensions);
    builder.learningRate(0.025);
    builder.labelsSource(docLabels);
    builder.windowSize(5);
    builder.iterate(sentences);
    builder.vocabCache(vocabCache);
    builder.tokenizerFactory(tokenizer);
    builder.sampling(0.1f);
    builder.workers(4);
    builder.trainWordVectors(true);

    vec = builder.build();
    vec.fit();
}

From source file:doc2vec.LuceneDocIterator.java

/**
 * Trains paragraph-vector (doc2vec) embeddings over a plain-text file, one document per line.
 *
 * @param docFile path to the text file to iterate over
 * @throws Exception if file iteration or training fails
 */
void learnDocEmbeddings(String docFile) throws Exception {

    SentenceIterator sentences = new BasicLineIterator(docFile);
    InMemoryLookupCache vocabCache = new InMemoryLookupCache();

    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource docLabels = new LabelsSource("DOCNO_");

    // Configure the trainer step by step instead of one long fluent chain.
    ParagraphVectors.Builder builder = new ParagraphVectors.Builder();
    builder.minWordFrequency(minwordfreq);
    builder.iterations(3);
    builder.epochs(5);
    builder.layerSize(numDimensions);
    builder.learningRate(0.025);
    builder.labelsSource(docLabels);
    builder.windowSize(5);
    builder.iterate(sentences);
    builder.vocabCache(vocabCache);
    builder.tokenizerFactory(tokenizer);
    builder.sampling(0.1f);
    builder.workers(4);
    builder.trainWordVectors(true);

    vec = builder.build();
    vec.fit();
}