List of usage examples for org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory (constructor DefaultTokenizerFactory)
DefaultTokenizerFactory
From source file:DL4J.java
License:Open Source License
public void run() throws IOException { log.info("Load data..."); // ClassPathResource resource = new ClassPathResource("raw_sentences.txt"); // SentenceIterator iter = new LineSentenceIterator(resource.getFile()); SentenceIterator iter = new LineSentenceIterator(new File("/Users/Joowon/Desktop/testFile.txt")); iter.setPreProcessor((SentencePreProcessor) String::toLowerCase); log.info("Tokenize data..."); final EndingPreProcessor preProcessor = new EndingPreProcessor(); TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(token -> { token = token.toLowerCase();/*from w w w. ja v a2 s . c o m*/ String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) System.out.println(); return base; }); log.info("Build model..."); int batchSize = 1000; int iterations = 3; int layerSize = 300; Word2Vec vec = new Word2Vec.Builder().batchSize(batchSize) // # words per minibatch. .minWordFrequency(5) // .useAdaGrad(false) // .layerSize(layerSize) // word feature vector size .iterations(iterations) // # iterations to train .learningRate(0.025) // .minLearningRate(1e-3) // learning rate decays wrt # words. 
floor learning .negativeSample(10) // sample size 10 words .iterate(iter) // .tokenizerFactory(tokenizerFactory) // .build(); vec.fit(); log.info("evaluate model..."); double sim = vec.similarity("people", "money"); log.info("Similarity between peeple and money: " + sim); Collection<String> similar = vec.wordsNearest("human", 10); log.info("Similarity words to 'day' : " + similar); // log.info("Plot TSNE"); // BarnesHutTsne tsne = new BarnesHutTsne.Builder() // .setMaxIter(1000) // .stopLyingIteration(250) // .learningRate(500) // .useAdaGrad(false) // .theta(0.5) // .setMomentum(0.5) // .normalize(true) // .usePca(false) // .build(); // vec.lookupTable().plotVocab(tsne); log.info("Save vectors...."); WordVectorSerializer.writeWordVectors(vec, "/Users/Joowon/Desktop/words.txt"); }
From source file:com.github.tteofili.p2h.Par2HierTest.java
License:Apache License
@Test public void testP2HOnMTPapers() throws Exception { ParagraphVectors paragraphVectors;//from www . j a v a 2 s . c o m LabelAwareIterator iterator; TokenizerFactory tokenizerFactory; ClassPathResource resource = new ClassPathResource("papers/sbc"); // build a iterator for our MT papers dataset iterator = new FilenamesLabelAwareIterator.Builder().addSourceFolder(resource.getFile()).build(); tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor()); Map<String, INDArray> hvs = new TreeMap<>(); Map<String, INDArray> pvs = new TreeMap<>(); paragraphVectors = new ParagraphVectors.Builder().iterate(iterator).tokenizerFactory(tokenizerFactory) .build(); // fit model paragraphVectors.fit(); Par2Hier par2Hier = new Par2Hier(paragraphVectors, method, k); // fit model par2Hier.fit(); Map<String, String[]> comparison = new TreeMap<>(); // extract paragraph vectors similarities WeightLookupTable<VocabWord> lookupTable = paragraphVectors.getLookupTable(); List<String> labels = paragraphVectors.getLabelsSource().getLabels(); for (String label : labels) { INDArray vector = lookupTable.vector(label); pvs.put(label, vector); Collection<String> strings = paragraphVectors.nearestLabels(vector, 2); Collection<String> hstrings = par2Hier.nearestLabels(vector, 2); String[] stringsArray = new String[2]; stringsArray[0] = new LinkedList<>(strings).get(1); stringsArray[1] = new LinkedList<>(hstrings).get(1); comparison.put(label, stringsArray); hvs.put(label, par2Hier.getLookupTable().vector(label)); } System.out.println("--->func(args):pv,p2h"); // measure similarity indexes double[] intraDocumentSimilarity = getIntraDocumentSimilarity(comparison); System.out.println("ids(" + k + "," + method + "):" + Arrays.toString(intraDocumentSimilarity)); double[] depthSimilarity = getDepthSimilarity(comparison); System.out.println("ds(" + k + "," + method + "):" + Arrays.toString(depthSimilarity)); // classification Map<Integer, 
Map<Integer, Long>> pvCounts = new HashMap<>(); Map<Integer, Map<Integer, Long>> p2hCounts = new HashMap<>(); for (String label : labels) { INDArray vector = lookupTable.vector(label); int topN = 1; Collection<String> strings = paragraphVectors.nearestLabels(vector, topN); Collection<String> hstrings = par2Hier.nearestLabels(vector, topN); int labelDepth = label.split("\\.").length - 1; int stringDepth = getClass(strings); int hstringDepth = getClass(hstrings); updateCM(pvCounts, labelDepth, stringDepth); updateCM(p2hCounts, labelDepth, hstringDepth); } ConfusionMatrix pvCM = new ConfusionMatrix(pvCounts); ConfusionMatrix p2hCM = new ConfusionMatrix(p2hCounts); System.out.println("mf1(" + k + "," + method + "):" + pvCM.getF1Measure() + "," + p2hCM.getF1Measure()); System.out.println("acc(" + k + "," + method + "):" + pvCM.getAccuracy() + "," + p2hCM.getAccuracy()); // create a CSV with a raw comparison File pvFile = Files.createFile(Paths.get("target/comparison-" + k + "-" + method + ".csv")).toFile(); FileOutputStream pvOutputStream = new FileOutputStream(pvFile); try { Map<String, INDArray> pvs2 = Par2HierUtils.svdPCA(pvs, 2); Map<String, INDArray> hvs2 = Par2HierUtils.svdPCA(hvs, 2); String pvCSV = asStrings(pvs2, hvs2); IOUtils.write(pvCSV, pvOutputStream); } finally { pvOutputStream.flush(); pvOutputStream.close(); } }
From source file:doc2vec.LuceneDocIterator.java
/**
 * Builds and fits a ParagraphVectors (doc2vec) model over the documents of a
 * Lucene index, storing the trained model in the {@code vec} field.
 *
 * @param indexDir directory containing the Lucene index to iterate over
 * @throws Exception if iteration or training fails
 */
void learnDocEmbeddings(File indexDir) throws Exception {
    boolean storedLabels = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));
    SentenceIterator sentences = new LuceneDocIterator(indexDir, stopFile, storedLabels);

    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    // document labels are generated as DOCNO_<n>
    LabelsSource docLabels = new LabelsSource("DOCNO_");
    InMemoryLookupCache vocabCache = new InMemoryLookupCache();

    vec = new ParagraphVectors.Builder()
            .minWordFrequency(minwordfreq)
            .iterations(3)
            .epochs(5)
            .layerSize(numDimensions)
            .learningRate(0.025)
            .labelsSource(docLabels)
            .windowSize(5)
            .iterate(sentences)
            .vocabCache(vocabCache)
            .tokenizerFactory(tokenizer)
            .sampling(0.1f)
            .workers(4)
            .trainWordVectors(true)
            .build();
    vec.fit();
}
From source file:doc2vec.LuceneDocIterator.java
/**
 * Builds and fits a ParagraphVectors (doc2vec) model over a plain-text file that
 * holds one document per line, storing the trained model in the {@code vec} field.
 *
 * @param docFile path to the line-per-document text file
 * @throws Exception if iteration or training fails
 */
void learnDocEmbeddings(String docFile) throws Exception {
    SentenceIterator sentences = new BasicLineIterator(docFile);

    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    // document labels are generated as DOCNO_<n>
    LabelsSource docLabels = new LabelsSource("DOCNO_");
    InMemoryLookupCache vocabCache = new InMemoryLookupCache();

    vec = new ParagraphVectors.Builder()
            .minWordFrequency(minwordfreq)
            .iterations(3)
            .epochs(5)
            .layerSize(numDimensions)
            .learningRate(0.025)
            .labelsSource(docLabels)
            .windowSize(5)
            .iterate(sentences)
            .vocabCache(vocabCache)
            .tokenizerFactory(tokenizer)
            .sampling(0.1f)
            .workers(4)
            .trainWordVectors(true)
            .build();
    vec.fit();
}
From source file:dollar.learner.smart.ParagraphVectorsClassifierExample.java
License:Apache License
void makeParagraphVectors() throws Exception { // build a iterator for our dataset File dir = TYPE_LEARNING_DIR; dir.mkdirs();// w w w .j ava2 s . com iterator = new FileLabelAwareIterator.Builder().addSourceFolder(new File(dir, "corpus")).build(); tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor()); // ParagraphVectors training configuration paragraphVectors = new ParagraphVectors.Builder().learningRate(0.025).minLearningRate(0.001).batchSize(1000) .epochs(5).iterate(iterator).trainWordVectors(true).tokenizerFactory(tokenizerFactory).build(); // Start model training paragraphVectors.fit(); }
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningChi.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("poem.txt"); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); /*/*from www . java 2 s . c o m*/ CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(3).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt"); String[] testwords = new String[] { "", "", "", "", "" }; for (String s : testwords) { Collection<String> lst = vec.wordsNearest(s, 5); System.out.println(s + " => " + lst); } Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5); System.out.println(kingList); }
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("gov-annc.txt"); iter.setPreProcessor(new SentencePreProcessor() { @Override/* w ww . j a v a 2 s . c o m*/ public String preProcess(String sentence) { return sentence.toLowerCase(); } }); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); final EndingPreProcessor preProcessor = new EndingPreProcessor(); t.setTokenPreProcessor(new TokenPreProcess() { @Override public String preProcess(String token) { token = token.toLowerCase(); String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) { System.out.println(); } return base; } }); /* CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt"); Collection<String> lst = vec.wordsNearest("information", 10); log.info("on 1st run: " + lst); System.out.println(vec.similarity("information", "data")); System.out.println(vec.similarity("information", "magic")); }
From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java
License:Open Source License
/** * Replaces each word contained in a document with its corresponding word vector. If a word from the document is not * contained in the used {@link WordVectors} model it will be skipped. The output is a {@link ListCell} containing * {@link ListCell}s containing the word vectors as {@link DoubleCell}s. * * @param wordVec the {@link WordVectors} model to use * @param document the document to use/*from ww w .j av a2 s.c o m*/ * @return {@link ListCell} of {@link ListCell}c of {@link DoubleCell}s containing converted words */ private ListCell replaceWordsByWordVector(final WordVectors wordVec, final String document) { final TokenizerFactory tokenizerFac = new DefaultTokenizerFactory(); tokenizerFac.setTokenPreProcessor(new CommonPreprocessor()); final Tokenizer t = tokenizerFac.create(document); final List<ListCell> listCells = new ArrayList<ListCell>(); while (t.hasMoreTokens()) { final String word = t.nextToken(); if (!word.isEmpty()) { if (wordVec.hasWord(word)) { listCells.add(wordToListCell(wordVec, word)); } else { m_unknownWords.add(word); } } } return CollectionCellFactory.createListCell(listCells); }
From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java
License:Open Source License
/**
 * Calculates the mean vector of all word vectors of all words contained in a document.
 * Tokens that are empty or missing from the vocabulary are skipped; missing ones are
 * recorded in {@code m_unknownWords}.
 *
 * Bug fix: the original counted matrix rows WITHOUT the {@code isEmpty()} check it
 * applied when filling the matrix, so the row count and the rows actually written
 * could disagree, leaving stray all-zero rows that skew the mean. Both passes now
 * use the identical filter.
 *
 * NOTE(review): if no token matches the vocabulary the matrix has zero rows and
 * {@code mean(0)} is undefined — callers should guard against empty documents.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document for which the mean should be calculated
 * @return {@link INDArray} containing the mean vector of the document
 */
private INDArray calculateDocumentMean(final WordVectors wordVec, final String document) {
    final TokenizerFactory tokenizerFac = new DefaultTokenizerFactory();
    tokenizerFac.setTokenPreProcessor(new CommonPreprocessor());
    final Tokenizer t = tokenizerFac.create(document);

    // single filtering pass so the row count always matches the rows written below
    final List<String> matchingTokens = new ArrayList<String>();
    for (final String token : t.getTokens()) {
        if (token.isEmpty()) {
            continue;
        }
        if (wordVec.hasWord(token)) {
            matchingTokens.add(token);
        } else {
            m_unknownWords.add(token);
        }
    }

    final INDArray documentWordVectors =
        Nd4j.create(matchingTokens.size(), wordVec.lookupTable().layerSize());
    int row = 0;
    for (final String token : matchingTokens) {
        documentWordVectors.putRow(row, wordVec.getWordVectorMatrix(token));
        row++;
    }
    return documentWordVectors.mean(0);
}
From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel2.java
License:Open Source License
private DataRow processRow(final DataRow row, final int documentColumnIndex, final WordVectors wordVectors) throws DataCellConversionException, IllegalStateException { final List<DataCell> cells = TableUtils.toListOfCells(row); final DataCell cell = row.getCell(documentColumnIndex); final String document = ConverterUtils.convertDataCellToJava(cell, String.class); ListCell convertedDocument;//from w w w . j a v a2 s . c o m final Tokenizer t = new DefaultTokenizerFactory().create(document); final List<String> matchingTokens = new ArrayList<String>(); for (final String token : t.getTokens()) { if (wordVectors.hasWord(token)) { matchingTokens.add(token); } else { m_unknownWordsCtr++; } m_totalWordsCtr++; } if (matchingTokens.size() == 0) { cells.add(new MissingCell("No tokens in row " + row.getKey() + " match the vocabulary!")); } else { if (m_calculateMean.getBooleanValue()) { final INDArray documentMeanVector = calculateDocumentMean(wordVectors, matchingTokens); convertedDocument = CollectionCellFactory .createListCell(NDArrayUtils.toListOfDoubleCells(documentMeanVector)); } else { convertedDocument = replaceWordsByWordVector(wordVectors, matchingTokens); } cells.add(convertedDocument); } return new DefaultRow(row.getKey(), cells); }