Example usage for org.deeplearning4j.text.tokenization.tokenizerfactory TokenizerFactory create

List of usage examples for org.deeplearning4j.text.tokenization.tokenizerfactory TokenizerFactory create

Introduction

In this page you can find the example usage for org.deeplearning4j.text.tokenization.tokenizerfactory TokenizerFactory create.

Prototype

Tokenizer create(InputStream toTokenize);

Source Link

Document

Create a tokenizer based on an input stream

Usage

From source file: org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java

License: Open Source License

/**
 * Replaces each word contained in a document with its corresponding word vector. If a word from the document is not
 * contained in the used {@link WordVectors} model it will be skipped. The output is a {@link ListCell} containing
 * {@link ListCell}s containing the word vectors as {@link DoubleCell}s.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document to use/*from w w w . j  a  v a  2  s .  com*/
 * @return {@link ListCell} of {@link ListCell}c of {@link DoubleCell}s containing converted words
 */
private ListCell replaceWordsByWordVector(final WordVectors wordVec, final String document) {
    final TokenizerFactory tokenizerFac = new DefaultTokenizerFactory();
    tokenizerFac.setTokenPreProcessor(new CommonPreprocessor());

    final Tokenizer t = tokenizerFac.create(document);
    final List<ListCell> listCells = new ArrayList<ListCell>();

    while (t.hasMoreTokens()) {
        final String word = t.nextToken();
        if (!word.isEmpty()) {
            if (wordVec.hasWord(word)) {
                listCells.add(wordToListCell(wordVec, word));
            } else {
                m_unknownWords.add(word);
            }
        }
    }
    return CollectionCellFactory.createListCell(listCells);
}

From source file: org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java

License: Open Source License

/**
 * Calculates the mean vector of all word vectors of all words contained in a document.
 * Tokens that are empty or not contained in the {@link WordVectors} vocabulary are skipped;
 * unknown (non-empty) tokens are recorded in {@code m_unknownWords}.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document for which the mean should be calculated
 * @return {@link INDArray} containing the mean vector of the document
 */
private INDArray calculateDocumentMean(final WordVectors wordVec, final String document) {
    final TokenizerFactory tokenizerFac = new DefaultTokenizerFactory();
    tokenizerFac.setTokenPreProcessor(new CommonPreprocessor());

    final Tokenizer t = tokenizerFac.create(document);
    final List<String> tokens = t.getTokens();

    // Count exactly the tokens that the fill loop below will write. The previous count
    // skipped the isEmpty() filter, so an empty token matching the vocabulary would
    // reserve a row that was never filled, leaving garbage/zeros that skew the mean.
    int numberOfWordsMatchingWithVoc = 0;
    for (final String token : tokens) {
        if (!token.isEmpty() && wordVec.hasWord(token)) {
            numberOfWordsMatchingWithVoc++;
        }
    }

    final INDArray documentWordVectors = Nd4j.create(numberOfWordsMatchingWithVoc,
            wordVec.lookupTable().layerSize());

    int i = 0;
    for (final String token : tokens) {
        if (!token.isEmpty()) {
            if (wordVec.hasWord(token)) {
                documentWordVectors.putRow(i, wordVec.getWordVectorMatrix(token));
                i++;
            } else {
                m_unknownWords.add(token);
            }
        }
    }
    // Row-wise mean over all collected word vectors (along dimension 0).
    return documentWordVectors.mean(0);
}