List of usage examples for org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor
CommonPreprocessor
From source file:com.github.tteofili.p2h.Par2HierTest.java
License:Apache License
@Test public void testP2HOnMTPapers() throws Exception { ParagraphVectors paragraphVectors;//from w w w .j a v a 2 s . co m LabelAwareIterator iterator; TokenizerFactory tokenizerFactory; ClassPathResource resource = new ClassPathResource("papers/sbc"); // build a iterator for our MT papers dataset iterator = new FilenamesLabelAwareIterator.Builder().addSourceFolder(resource.getFile()).build(); tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor()); Map<String, INDArray> hvs = new TreeMap<>(); Map<String, INDArray> pvs = new TreeMap<>(); paragraphVectors = new ParagraphVectors.Builder().iterate(iterator).tokenizerFactory(tokenizerFactory) .build(); // fit model paragraphVectors.fit(); Par2Hier par2Hier = new Par2Hier(paragraphVectors, method, k); // fit model par2Hier.fit(); Map<String, String[]> comparison = new TreeMap<>(); // extract paragraph vectors similarities WeightLookupTable<VocabWord> lookupTable = paragraphVectors.getLookupTable(); List<String> labels = paragraphVectors.getLabelsSource().getLabels(); for (String label : labels) { INDArray vector = lookupTable.vector(label); pvs.put(label, vector); Collection<String> strings = paragraphVectors.nearestLabels(vector, 2); Collection<String> hstrings = par2Hier.nearestLabels(vector, 2); String[] stringsArray = new String[2]; stringsArray[0] = new LinkedList<>(strings).get(1); stringsArray[1] = new LinkedList<>(hstrings).get(1); comparison.put(label, stringsArray); hvs.put(label, par2Hier.getLookupTable().vector(label)); } System.out.println("--->func(args):pv,p2h"); // measure similarity indexes double[] intraDocumentSimilarity = getIntraDocumentSimilarity(comparison); System.out.println("ids(" + k + "," + method + "):" + Arrays.toString(intraDocumentSimilarity)); double[] depthSimilarity = getDepthSimilarity(comparison); System.out.println("ds(" + k + "," + method + "):" + Arrays.toString(depthSimilarity)); // classification Map<Integer, 
Map<Integer, Long>> pvCounts = new HashMap<>(); Map<Integer, Map<Integer, Long>> p2hCounts = new HashMap<>(); for (String label : labels) { INDArray vector = lookupTable.vector(label); int topN = 1; Collection<String> strings = paragraphVectors.nearestLabels(vector, topN); Collection<String> hstrings = par2Hier.nearestLabels(vector, topN); int labelDepth = label.split("\\.").length - 1; int stringDepth = getClass(strings); int hstringDepth = getClass(hstrings); updateCM(pvCounts, labelDepth, stringDepth); updateCM(p2hCounts, labelDepth, hstringDepth); } ConfusionMatrix pvCM = new ConfusionMatrix(pvCounts); ConfusionMatrix p2hCM = new ConfusionMatrix(p2hCounts); System.out.println("mf1(" + k + "," + method + "):" + pvCM.getF1Measure() + "," + p2hCM.getF1Measure()); System.out.println("acc(" + k + "," + method + "):" + pvCM.getAccuracy() + "," + p2hCM.getAccuracy()); // create a CSV with a raw comparison File pvFile = Files.createFile(Paths.get("target/comparison-" + k + "-" + method + ".csv")).toFile(); FileOutputStream pvOutputStream = new FileOutputStream(pvFile); try { Map<String, INDArray> pvs2 = Par2HierUtils.svdPCA(pvs, 2); Map<String, INDArray> hvs2 = Par2HierUtils.svdPCA(hvs, 2); String pvCSV = asStrings(pvs2, hvs2); IOUtils.write(pvCSV, pvOutputStream); } finally { pvOutputStream.flush(); pvOutputStream.close(); } }
From source file:doc2vec.LuceneDocIterator.java
/**
 * Trains paragraph (doc2vec) embeddings over the documents of a Lucene index and stores the
 * resulting model in the {@code vec} field.
 *
 * @param indexDir directory containing the Lucene index to iterate over
 * @throws Exception if reading the index or fitting the model fails
 */
void learnDocEmbeddings(File indexDir) throws Exception {
    // whether per-word labels were stored in the index; defaults to false
    boolean storedLabels = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));
    SentenceIterator iter = new LuceneDocIterator(indexDir, stopFile, storedLabels);
    InMemoryLookupCache cache = new InMemoryLookupCache();
    TokenizerFactory t = new DefaultTokenizerFactory();
    // CommonPreprocessor strips punctuation/numbers and lower-cases tokens
    t.setTokenPreProcessor(new CommonPreprocessor());
    // document labels are generated as DOCNO_0, DOCNO_1, ...
    LabelsSource source = new LabelsSource("DOCNO_");
    vec = new ParagraphVectors.Builder().minWordFrequency(minwordfreq).iterations(3).epochs(5)
            .layerSize(numDimensions).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter)
            .vocabCache(cache).tokenizerFactory(t).sampling(0.1f).workers(4).trainWordVectors(true).build();
    vec.fit();
}
From source file:doc2vec.LuceneDocIterator.java
/**
 * Trains paragraph (doc2vec) embeddings over a plain-text file (one document per line) and
 * stores the resulting model in the {@code vec} field. Same configuration as the
 * Lucene-index overload.
 *
 * @param docFile path to the text file, one document per line
 * @throws Exception if reading the file or fitting the model fails
 */
void learnDocEmbeddings(String docFile) throws Exception {
    SentenceIterator iter = new BasicLineIterator(docFile);
    InMemoryLookupCache cache = new InMemoryLookupCache();
    TokenizerFactory t = new DefaultTokenizerFactory();
    // CommonPreprocessor strips punctuation/numbers and lower-cases tokens
    t.setTokenPreProcessor(new CommonPreprocessor());
    // document labels are generated as DOCNO_0, DOCNO_1, ...
    LabelsSource source = new LabelsSource("DOCNO_");
    vec = new ParagraphVectors.Builder().minWordFrequency(minwordfreq).iterations(3).epochs(5)
            .layerSize(numDimensions).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter)
            .vocabCache(cache).tokenizerFactory(t).sampling(0.1f).workers(4).trainWordVectors(true).build();
    vec.fit();
}
From source file:dollar.learner.smart.ParagraphVectorsClassifierExample.java
License:Apache License
void makeParagraphVectors() throws Exception { // build a iterator for our dataset File dir = TYPE_LEARNING_DIR; dir.mkdirs();//from w w w. j av a 2 s. co m iterator = new FileLabelAwareIterator.Builder().addSourceFolder(new File(dir, "corpus")).build(); tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor()); // ParagraphVectors training configuration paragraphVectors = new ParagraphVectors.Builder().learningRate(0.025).minLearningRate(0.001).batchSize(1000) .epochs(5).iterate(iterator).trainWordVectors(true).tokenizerFactory(tokenizerFactory).build(); // Start model training paragraphVectors.fit(); }
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningChi.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("poem.txt"); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); /*/*from www .ja v a 2s. c o m*/ CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(3).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt"); String[] testwords = new String[] { "", "", "", "", "" }; for (String s : testwords) { Collection<String> lst = vec.wordsNearest(s, 5); System.out.println(s + " => " + lst); } Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5); System.out.println(kingList); }
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("gov-annc.txt"); iter.setPreProcessor(new SentencePreProcessor() { @Override/*from w w w . java2 s. co m*/ public String preProcess(String sentence) { return sentence.toLowerCase(); } }); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); final EndingPreProcessor preProcessor = new EndingPreProcessor(); t.setTokenPreProcessor(new TokenPreProcess() { @Override public String preProcess(String token) { token = token.toLowerCase(); String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) { System.out.println(); } return base; } }); /* CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt"); Collection<String> lst = vec.wordsNearest("information", 10); log.info("on 1st run: " + lst); System.out.println(vec.similarity("information", "data")); System.out.println(vec.similarity("information", "magic")); }
From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java
License:Open Source License
/**
 * Replaces each word contained in a document with its corresponding word vector. A word that is
 * not present in the given {@link WordVectors} model is recorded in {@code m_unknownWords} and
 * skipped; empty tokens are ignored entirely. The output is a {@link ListCell} containing
 * {@link ListCell}s containing the word vectors as {@link DoubleCell}s.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document to convert
 * @return {@link ListCell} of {@link ListCell}s of {@link DoubleCell}s containing converted words
 */
private ListCell replaceWordsByWordVector(final WordVectors wordVec, final String document) {
    final TokenizerFactory factory = new DefaultTokenizerFactory();
    factory.setTokenPreProcessor(new CommonPreprocessor());
    final Tokenizer tokenizer = factory.create(document);

    final List<ListCell> vectorCells = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) {
        final String token = tokenizer.nextToken();
        if (token.isEmpty()) {
            continue; // preprocessing may reduce a token to nothing
        }
        if (wordVec.hasWord(token)) {
            vectorCells.add(wordToListCell(wordVec, token));
        } else {
            m_unknownWords.add(token); // track for user feedback, then skip
        }
    }
    return CollectionCellFactory.createListCell(vectorCells);
}
From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java
License:Open Source License
/**
 * Calculates the mean vector of all word vectors of all words contained in a document. Words not
 * present in the {@link WordVectors} model are recorded in {@code m_unknownWords} and skipped;
 * empty tokens are ignored.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document for which the mean should be calculated
 * @return {@link INDArray} containing the mean vector of the document
 */
private INDArray calculateDocumentMean(final WordVectors wordVec, final String document) {
    final TokenizerFactory tokenizerFac = new DefaultTokenizerFactory();
    tokenizerFac.setTokenPreProcessor(new CommonPreprocessor());
    final Tokenizer t = tokenizerFac.create(document);
    final List<String> tokens = t.getTokens();

    // Fix: the original counting loop omitted the !token.isEmpty() guard that the filling
    // loop applies, so an empty token known to the model would inflate the row count and
    // leave a zero row in the matrix, skewing the mean. Both loops now use the same guard.
    int numberOfWordsMatchingWithVoc = 0;
    for (final String token : tokens) {
        if (!token.isEmpty() && wordVec.hasWord(token)) {
            numberOfWordsMatchingWithVoc++;
        }
    }

    // one row per in-vocabulary token; row width = embedding layer size
    final INDArray documentWordVectors = Nd4j.create(numberOfWordsMatchingWithVoc,
            wordVec.lookupTable().layerSize());
    int i = 0;
    for (final String token : tokens) {
        if (token.isEmpty()) {
            continue;
        }
        if (wordVec.hasWord(token)) {
            documentWordVectors.putRow(i, wordVec.getWordVectorMatrix(token));
            i++;
        } else {
            m_unknownWords.add(token); // track for user feedback, then skip
        }
    }
    // mean over rows (axis 0) -> single vector of layerSize entries
    final INDArray documentMeanVector = documentWordVectors.mean(0);
    return documentMeanVector;
}
From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.learn.WordVectorLearnerNodeModel.java
License:Open Source License
/**
 * Trains a word-vector model (doc2vec or word2vec, chosen via the node's training-mode setting)
 * from the documents in the first input table and returns it wrapped in a
 * {@link WordVectorPortObject}.
 *
 * @param inObjects node inputs; index 0 must be a non-empty {@link BufferedDataTable}
 * @param exec the execution context (unused here beyond the framework contract)
 * @return a single-element array holding the trained model port object
 * @throws Exception on invalid settings, empty input, or training failure
 */
@Override
protected WordVectorPortObject[] execute(final PortObject[] inObjects, final ExecutionContext exec)
        throws Exception {
    final BufferedDataTable table = (BufferedDataTable) inObjects[0];
    // reject empty input up front rather than failing mid-training
    TableUtils.checkForEmptyTable(table);

    final WordVectorTrainingMode mode = WordVectorTrainingMode.valueOf(
            m_wordVecParameterSettings.getString(WordVectorLearnerParameter.WORD_VECTOR_TRAINING_MODE));
    final String labelColumnName = m_dataParameterSettings.getString(DataParameter.LABEL_COLUMN);
    final String documentColumnName = m_dataParameterSettings.getString(DataParameter.DOCUMENT_COLUMN);
    WordVectors wordVectors = null;

    // training parameters (read from the three settings models configured in the dialog)
    final int trainingIterations = m_learnerParameterSettings.getInteger(LearnerParameter.TRAINING_ITERATIONS);
    final int minWordFrequency = m_wordVecParameterSettings
            .getInteger(WordVectorLearnerParameter.MIN_WORD_FREQUENCY);
    final int layerSize = m_wordVecParameterSettings.getInteger(WordVectorLearnerParameter.LAYER_SIZE);
    final int seed = m_learnerParameterSettings.getInteger(LearnerParameter.SEED);
    final double learningRate = m_learnerParameterSettings.getDouble(LearnerParameter.GLOBAL_LEARNING_RATE);
    final double minLearningRate = m_wordVecParameterSettings
            .getDouble(WordVectorLearnerParameter.MIN_LEARNING_RATE);
    final int windowSize = m_wordVecParameterSettings.getInteger(WordVectorLearnerParameter.WINDOW_SIZE);
    final int epochs = m_dataParameterSettings.getInteger(DataParameter.EPOCHS);
    final int batchSize = m_dataParameterSettings.getInteger(DataParameter.BATCH_SIZE);

    // sentence tokenizer and preprocessing: CommonPreprocessor (strip punctuation/numbers,
    // lower-case) is attached only when the user enabled basic preprocessing
    final boolean usePreproc = m_wordVecParameterSettings
            .getBoolean(WordVectorLearnerParameter.USE_BASIC_PREPROCESSING);
    final TokenizerFactory t = new DefaultTokenizerFactory();
    if (usePreproc) {
        t.setTokenPreProcessor(new CommonPreprocessor());
    }

    switch (mode) {
    case DOC2VEC:
        // labelled documents: each row supplies a document plus its label column
        final LabelAwareIterator docIter = new BufferedDataTableLabelledDocumentIterator(table,
                documentColumnName, labelColumnName);
        // build doc2vec model
        final ParagraphVectors d2v = new ParagraphVectors.Builder().learningRate(learningRate)
                .minLearningRate(minLearningRate).seed(seed).layerSize(layerSize).batchSize(batchSize)
                .windowSize(windowSize).minWordFrequency(minWordFrequency).iterations(trainingIterations)
                .epochs(epochs).iterate(docIter).trainWordVectors(true).tokenizerFactory(t)
                .allowParallelTokenization(false).build();
        d2v.fit();
        wordVectors = d2v;
        break;
    case WORD2VEC:
        // unlabelled sentences: only the document column is consumed
        final SentenceIterator sentenceIter = new BufferedDataTableSentenceIterator(table, documentColumnName);
        // build word2vec model
        final Word2Vec w2v = new Word2Vec.Builder().learningRate(learningRate).minLearningRate(minLearningRate)
                .seed(seed).layerSize(layerSize).batchSize(batchSize).windowSize(windowSize)
                .minWordFrequency(minWordFrequency).iterations(trainingIterations).epochs(epochs)
                .iterate(sentenceIter).tokenizerFactory(t).allowParallelTokenization(false).build();
        w2v.fit();
        wordVectors = w2v;
        break;
    default:
        throw new InvalidSettingsException("No case defined for WordVectorTrainingMode: " + mode);
    }
    final WordVectorPortObject outPortObject = new WordVectorPortObject(wordVectors, m_outputSpec);
    return new WordVectorPortObject[] { outPortObject };
}
From source file:uk.bl.wa.nlp.wordvec.WordvecProcessor.java
License:Open Source License
/**
 * Trains a Word2Vec model on a sample text (sentence-split with the Stanford NLP splitter),
 * serializes the word vectors and the full model to disk, and prints the words nearest to
 * "french" as a sanity check.
 *
 * @param args unused
 * @throws Exception if reading the corpus, training, or serialization fails
 */
public static void main(String[] args) throws Exception {
    // Use Stanford NLP sentence splitter:
    SentenceIterator iter = new StanfordSentenceIterator(new FileReader("src/test/resources/Mona_Lisa.txt"));

    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    // strips punctuation/numbers and lower-cases tokens
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");
    // Write word vectors
    WordVectorSerializer.writeWordVectors(vec, "pathToWriteto.txt");
    WordVectorSerializer.writeFullModel(vec, "pathToWriteto.model");

    log.info("Closest Words:");
    Collection<String> lst = vec.wordsNearest("french", 10);
    System.out.println(lst);

    // UiServer server = UiServer.getInstance();
    // System.out.println("Started on port " + server.getPort());
}