Usage examples for the org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache constructor
public InMemoryLookupCache()
From source file:de.mpii.docsimilarity.mr.utils.io.WordVectorSerializer.java
License:Apache License
/** * Loads an in memory cache from the given path (sets syn0 and the vocab) * * @param vectorsFile/*from w ww . j av a 2 s . c o m*/ * the path of the file to load * @return * @throws FileNotFoundException */ public static Pair<InMemoryLookupTable, VocabCache> loadTxt(File vectorsFile) throws FileNotFoundException { BufferedReader write = new BufferedReader(new FileReader(vectorsFile)); VocabCache cache = new InMemoryLookupCache(); InMemoryLookupTable lookupTable; LineIterator iter = IOUtils.lineIterator(write); List<INDArray> arrays = new ArrayList<>(); while (iter.hasNext()) { String line = iter.nextLine(); String[] split = line.split(" "); String word = split[0]; VocabWord word1 = new VocabWord(1.0, word); cache.addToken(word1); cache.addWordToIndex(cache.numWords(), word); word1.setIndex(cache.numWords()); cache.putVocabWord(word); INDArray row = Nd4j.create(Nd4j.createBuffer(split.length - 1)); for (int i = 1; i < split.length; i++) { row.putScalar(i - 1, Float.parseFloat(split[i])); } arrays.add(row); } INDArray syn = Nd4j.create(new int[] { arrays.size(), arrays.get(0).columns() }); for (int i = 0; i < syn.rows(); i++) { syn.putRow(i, arrays.get(i)); } lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().vectorLength(arrays.get(0).columns()) .useAdaGrad(false).cache(cache).build(); Nd4j.clearNans(syn); lookupTable.setSyn0(syn); iter.close(); return new Pair<>(lookupTable, cache); }
From source file:doc2vec.LuceneDocIterator.java
void learnDocEmbeddings(File indexDir) throws Exception { boolean storedLabels = Boolean.parseBoolean(prop.getProperty("word.labels", "false")); SentenceIterator iter = new LuceneDocIterator(indexDir, stopFile, storedLabels); InMemoryLookupCache cache = new InMemoryLookupCache(); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); LabelsSource source = new LabelsSource("DOCNO_"); vec = new ParagraphVectors.Builder().minWordFrequency(minwordfreq).iterations(3).epochs(5) .layerSize(numDimensions).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter) .vocabCache(cache).tokenizerFactory(t).sampling(0.1f).workers(4).trainWordVectors(true).build(); vec.fit();// w w w . j av a 2s .c o m }
From source file:doc2vec.LuceneDocIterator.java
void learnDocEmbeddings(String docFile) throws Exception { SentenceIterator iter = new BasicLineIterator(docFile); InMemoryLookupCache cache = new InMemoryLookupCache(); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); LabelsSource source = new LabelsSource("DOCNO_"); vec = new ParagraphVectors.Builder().minWordFrequency(minwordfreq).iterations(3).epochs(5) .layerSize(numDimensions).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter) .vocabCache(cache).tokenizerFactory(t).sampling(0.1f).workers(4).trainWordVectors(true).build(); vec.fit();//from w ww . j a va 2 s .c o m }