List of usage examples for the method org.deeplearning4j.text.tokenization.tokenizer.Tokenizer#hasMoreTokens
boolean hasMoreTokens();
From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java
License:Open Source License
/**
 * Converts a document into a list of word vectors, one entry per token known to the model.
 *
 * <p>The document is tokenized with a {@link DefaultTokenizerFactory} whose tokens are passed
 * through a {@link CommonPreprocessor}. Tokens that are empty after preprocessing are ignored.
 * Tokens that the {@link WordVectors} model does not contain produce no output entry and are
 * recorded in {@code m_unknownWords} instead.
 *
 * @param wordVec the {@link WordVectors} model used to look up each token
 * @param document the text to tokenize and convert
 * @return a {@link ListCell} whose elements are the {@link ListCell}s built by
 *         {@code wordToListCell} for every known token, in document order
 */
private ListCell replaceWordsByWordVector(final WordVectors wordVec, final String document) {
    final TokenizerFactory factory = new DefaultTokenizerFactory();
    factory.setTokenPreProcessor(new CommonPreprocessor());

    final List<ListCell> vectorCells = new ArrayList<>();
    for (final Tokenizer tokens = factory.create(document); tokens.hasMoreTokens();) {
        final String token = tokens.nextToken();
        if (token.isEmpty()) {
            // Preprocessing can reduce a token to nothing; there is no word to look up.
            continue;
        }
        if (!wordVec.hasWord(token)) {
            // Unknown to the model: remember it and emit no vector for it.
            m_unknownWords.add(token);
            continue;
        }
        vectorCells.add(wordToListCell(wordVec, token));
    }
    return CollectionCellFactory.createListCell(vectorCells);
}