Example usage for org.deeplearning4j.text.tokenization.tokenizer.preprocessor CommonPreprocessor CommonPreprocessor

List of usage examples for org.deeplearning4j.text.tokenization.tokenizer.preprocessor CommonPreprocessor CommonPreprocessor

Introduction

On this page you can find an example usage for org.deeplearning4j.text.tokenization.tokenizer.preprocessor CommonPreprocessor CommonPreprocessor.

Prototype

CommonPreprocessor

Source Link

Usage

From source file:com.github.tteofili.p2h.Par2HierTest.java

License:Apache License

@Test
public void testP2HOnMTPapers() throws Exception {
    ClassPathResource resource = new ClassPathResource("papers/sbc");

    // build an iterator for our MT papers dataset
    LabelAwareIterator iterator = new FilenamesLabelAwareIterator.Builder().addSourceFolder(resource.getFile())
            .build();

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // per-label vectors: hvs holds Par2Hier vectors, pvs plain paragraph vectors
    Map<String, INDArray> hvs = new TreeMap<>();
    Map<String, INDArray> pvs = new TreeMap<>();

    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder().iterate(iterator)
            .tokenizerFactory(tokenizerFactory).build();

    // fit paragraph-vectors model
    paragraphVectors.fit();

    Par2Hier par2Hier = new Par2Hier(paragraphVectors, method, k);

    // fit Par2Hier model on top of it
    par2Hier.fit();

    Map<String, String[]> comparison = new TreeMap<>();

    // extract paragraph vectors similarities
    WeightLookupTable<VocabWord> lookupTable = paragraphVectors.getLookupTable();
    List<String> labels = paragraphVectors.getLabelsSource().getLabels();
    for (String label : labels) {
        INDArray vector = lookupTable.vector(label);
        pvs.put(label, vector);
        Collection<String> strings = paragraphVectors.nearestLabels(vector, 2);
        Collection<String> hstrings = par2Hier.nearestLabels(vector, 2);
        // index 1: second-nearest neighbour (index 0 is presumably the label itself)
        String[] stringsArray = new String[2];
        stringsArray[0] = new LinkedList<>(strings).get(1);
        stringsArray[1] = new LinkedList<>(hstrings).get(1);
        comparison.put(label, stringsArray);
        hvs.put(label, par2Hier.getLookupTable().vector(label));
    }

    System.out.println("--->func(args):pv,p2h");

    // measure similarity indexes
    double[] intraDocumentSimilarity = getIntraDocumentSimilarity(comparison);
    System.out.println("ids(" + k + "," + method + "):" + Arrays.toString(intraDocumentSimilarity));
    double[] depthSimilarity = getDepthSimilarity(comparison);
    System.out.println("ds(" + k + "," + method + "):" + Arrays.toString(depthSimilarity));

    // classification: confusion matrices of predicted vs. actual label depth
    Map<Integer, Map<Integer, Long>> pvCounts = new HashMap<>();
    Map<Integer, Map<Integer, Long>> p2hCounts = new HashMap<>();
    for (String label : labels) {

        INDArray vector = lookupTable.vector(label);
        int topN = 1;
        Collection<String> strings = paragraphVectors.nearestLabels(vector, topN);
        Collection<String> hstrings = par2Hier.nearestLabels(vector, topN);
        // depth of a hierarchical label = number of '.' separators
        int labelDepth = label.split("\\.").length - 1;

        int stringDepth = getClass(strings);
        int hstringDepth = getClass(hstrings);

        updateCM(pvCounts, labelDepth, stringDepth);
        updateCM(p2hCounts, labelDepth, hstringDepth);
    }

    ConfusionMatrix pvCM = new ConfusionMatrix(pvCounts);
    ConfusionMatrix p2hCM = new ConfusionMatrix(p2hCounts);

    System.out.println("mf1(" + k + "," + method + "):" + pvCM.getF1Measure() + "," + p2hCM.getF1Measure());
    System.out.println("acc(" + k + "," + method + "):" + pvCM.getAccuracy() + "," + p2hCM.getAccuracy());

    // create a CSV with a raw comparison; try-with-resources guarantees the
    // stream is closed even when the PCA projection or the write fails (the
    // original flushed/closed in a finally block, where a failing flush could
    // mask the primary exception)
    File pvFile = Files.createFile(Paths.get("target/comparison-" + k + "-" + method + ".csv")).toFile();
    try (FileOutputStream pvOutputStream = new FileOutputStream(pvFile)) {
        Map<String, INDArray> pvs2 = Par2HierUtils.svdPCA(pvs, 2);
        Map<String, INDArray> hvs2 = Par2HierUtils.svdPCA(hvs, 2);
        String pvCSV = asStrings(pvs2, hvs2);
        IOUtils.write(pvCSV, pvOutputStream);
    }
}

From source file:doc2vec.LuceneDocIterator.java

/**
 * Learns document embeddings from a Lucene index directory with ParagraphVectors
 * and stores the trained model in the {@code vec} field.
 *
 * @param indexDir directory containing the Lucene index to iterate over
 * @throws Exception if the iterator cannot be built or training fails
 */
void learnDocEmbeddings(File indexDir) throws Exception {

    // whether word labels are stored alongside documents; defaults to false
    boolean storedLabels = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));
    SentenceIterator sentences = new LuceneDocIterator(indexDir, stopFile, storedLabels);
    InMemoryLookupCache vocabCache = new InMemoryLookupCache();

    // whitespace tokenization; CommonPreprocessor lower-cases tokens and strips
    // numbers/punctuation
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // document labels are generated as DOCNO_<n>
    LabelsSource labelsSource = new LabelsSource("DOCNO_");

    vec = new ParagraphVectors.Builder()
            .minWordFrequency(minwordfreq)
            .iterations(3)
            .epochs(5)
            .layerSize(numDimensions)
            .learningRate(0.025)
            .labelsSource(labelsSource)
            .windowSize(5)
            .iterate(sentences)
            .vocabCache(vocabCache)
            .tokenizerFactory(tokenizerFactory)
            .sampling(0.1f)
            .workers(4)
            .trainWordVectors(true)
            .build();
    vec.fit();
}

From source file:doc2vec.LuceneDocIterator.java

/**
 * Learns document embeddings from a plain text file (one document per line)
 * with ParagraphVectors and stores the trained model in the {@code vec} field.
 *
 * @param docFile path to the text file to iterate over line by line
 * @throws Exception if the iterator cannot be built or training fails
 */
void learnDocEmbeddings(String docFile) throws Exception {

    SentenceIterator sentences = new BasicLineIterator(docFile);
    InMemoryLookupCache vocabCache = new InMemoryLookupCache();

    // whitespace tokenization; CommonPreprocessor lower-cases tokens and strips
    // numbers/punctuation
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // document labels are generated as DOCNO_<n>
    LabelsSource labelsSource = new LabelsSource("DOCNO_");

    vec = new ParagraphVectors.Builder()
            .minWordFrequency(minwordfreq)
            .iterations(3)
            .epochs(5)
            .layerSize(numDimensions)
            .learningRate(0.025)
            .labelsSource(labelsSource)
            .windowSize(5)
            .iterate(sentences)
            .vocabCache(vocabCache)
            .tokenizerFactory(tokenizerFactory)
            .sampling(0.1f)
            .workers(4)
            .trainWordVectors(true)
            .build();
    vec.fit();
}

From source file:dollar.learner.smart.ParagraphVectorsClassifierExample.java

License:Apache License

/**
 * Builds a ParagraphVectors model from the documents in the corpus folder under
 * {@code TYPE_LEARNING_DIR}, storing iterator, tokenizer factory and model in
 * the corresponding fields.
 *
 * @throws Exception if the learning directory cannot be created or training fails
 */
void makeParagraphVectors() throws Exception {

    // build an iterator for our dataset
    File dir = TYPE_LEARNING_DIR;
    // mkdirs() returns false both on failure AND when the directory already
    // exists, so only fail when the directory is genuinely missing afterwards
    // (the original silently ignored the return value)
    if (!dir.mkdirs() && !dir.isDirectory()) {
        throw new IllegalStateException("could not create learning directory: " + dir);
    }
    iterator = new FileLabelAwareIterator.Builder().addSourceFolder(new File(dir, "corpus")).build();

    tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // ParagraphVectors training configuration
    paragraphVectors = new ParagraphVectors.Builder().learningRate(0.025).minLearningRate(0.001).batchSize(1000)
            .epochs(5).iterate(iterator).trainWordVectors(true).tokenizerFactory(tokenizerFactory).build();

    // Start model training
    paragraphVectors.fit();
}

From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningChi.java

public static void main(String[] args) throws Exception {
    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator("poem.txt");
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    /*/*from   www  .ja  v a 2s.  c  o m*/
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally it forces lower case for all tokens.
     */
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(3).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt");

    String[] testwords = new String[] { "", "", "", "", "" };
    for (String s : testwords) {
        Collection<String> lst = vec.wordsNearest(s, 5);
        System.out.println(s + " => " + lst);
    }

    Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5);
    System.out.println(kingList);
}

From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java

/**
 * Trains a Word2Vec model on "gov-annc.txt", writes the vectors to
 * "gov-annc-vec.txt" and prints a nearest-word query plus two similarity scores.
 */
public static void main(String[] args) throws Exception {

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator("gov-annc.txt");
    // lower-case every sentence before tokenization
    iter.setPreProcessor(new SentencePreProcessor() {
        @Override
        public String preProcess(String sentence) {
            return sentence.toLowerCase();
        }
    });

    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    /*
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally it forces lower case for all tokens.
     */
    // The original first installed a custom EndingPreProcessor-based TokenPreProcess
    // here, but it was immediately overwritten by the call below and never took
    // effect (it also contained a stray debug println). Removed as dead code; the
    // effective preprocessor is unchanged.
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt");

    Collection<String> lst = vec.wordsNearest("information", 10);
    log.info("on 1st run: " + lst);

    System.out.println(vec.similarity("information", "data"));
    System.out.println(vec.similarity("information", "magic"));

}

From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java

License:Open Source License

/**
 * Replaces each word contained in a document with its corresponding word vector. If a word from the document is not
 * contained in the used {@link WordVectors} model it will be skipped and recorded in {@code m_unknownWords}. The
 * output is a {@link ListCell} containing {@link ListCell}s containing the word vectors as {@link DoubleCell}s.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document to use
 * @return {@link ListCell} of {@link ListCell}s of {@link DoubleCell}s containing converted words
 */
private ListCell replaceWordsByWordVector(final WordVectors wordVec, final String document) {
    // fresh tokenizer factory per call; CommonPreprocessor normalizes each token
    final TokenizerFactory tokenizerFac = new DefaultTokenizerFactory();
    tokenizerFac.setTokenPreProcessor(new CommonPreprocessor());

    final Tokenizer t = tokenizerFac.create(document);
    final List<ListCell> listCells = new ArrayList<ListCell>();

    while (t.hasMoreTokens()) {
        final String word = t.nextToken();
        // preprocessing may reduce a token to the empty string; skip those
        if (!word.isEmpty()) {
            if (wordVec.hasWord(word)) {
                listCells.add(wordToListCell(wordVec, word));
            } else {
                // remember words missing from the model for later reporting
                m_unknownWords.add(word);
            }
        }
    }
    return CollectionCellFactory.createListCell(listCells);
}

From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java

License:Open Source License

/**
 * Calculates the mean vector of all word vectors of all words contained in a document. Tokens that are empty after
 * preprocessing or missing from the model are skipped (missing words are recorded in {@code m_unknownWords}).
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document for which the mean should be calculated
 * @return {@link INDArray} containing the mean vector of the document
 */
private INDArray calculateDocumentMean(final WordVectors wordVec, final String document) {
    final TokenizerFactory tokenizerFac = new DefaultTokenizerFactory();
    tokenizerFac.setTokenPreProcessor(new CommonPreprocessor());

    final Tokenizer t = tokenizerFac.create(document);
    final List<String> tokens = t.getTokens();

    // Count exactly the rows the fill loop below will write. The original
    // counting loop omitted the isEmpty() check used when filling, so an empty
    // token present in the vocabulary would have left an extra all-zero row in
    // the matrix and skewed the mean.
    int numberOfWordsMatchingWithVoc = 0;
    for (final String token : tokens) {
        if (!token.isEmpty() && wordVec.hasWord(token)) {
            numberOfWordsMatchingWithVoc++;
        }
    }

    final INDArray documentWordVectors = Nd4j.create(numberOfWordsMatchingWithVoc,
            wordVec.lookupTable().layerSize());

    int i = 0;
    for (final String token : tokens) {
        if (!token.isEmpty()) {
            if (wordVec.hasWord(token)) {
                documentWordVectors.putRow(i, wordVec.getWordVectorMatrix(token));
                i++;
            } else {
                // remember words missing from the model for later reporting
                m_unknownWords.add(token);
            }
        }
    }
    // NOTE(review): if no token matches the vocabulary this takes the mean of a
    // 0-row matrix — confirm Nd4j's behavior is acceptable for that edge case
    final INDArray documentMeanVector = documentWordVectors.mean(0);
    return documentMeanVector;
}

From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.learn.WordVectorLearnerNodeModel.java

License:Open Source License

/**
 * Trains a word-vector model (Doc2Vec or Word2Vec, per the configured training
 * mode) from the documents in the input table and returns it as a
 * WordVectorPortObject.
 */
@Override
protected WordVectorPortObject[] execute(final PortObject[] inObjects, final ExecutionContext exec)
        throws Exception {
    final BufferedDataTable table = (BufferedDataTable) inObjects[0];

    TableUtils.checkForEmptyTable(table);

    // which model to train (DOC2VEC or WORD2VEC), plus the input columns
    final WordVectorTrainingMode mode = WordVectorTrainingMode.valueOf(
            m_wordVecParameterSettings.getString(WordVectorLearnerParameter.WORD_VECTOR_TRAINING_MODE));
    final String labelColumnName = m_dataParameterSettings.getString(DataParameter.LABEL_COLUMN);
    final String documentColumnName = m_dataParameterSettings.getString(DataParameter.DOCUMENT_COLUMN);
    WordVectors wordVectors = null;

    // training parameters (read from the node's dialog settings)
    final int trainingIterations = m_learnerParameterSettings.getInteger(LearnerParameter.TRAINING_ITERATIONS);
    final int minWordFrequency = m_wordVecParameterSettings
            .getInteger(WordVectorLearnerParameter.MIN_WORD_FREQUENCY);
    final int layerSize = m_wordVecParameterSettings.getInteger(WordVectorLearnerParameter.LAYER_SIZE);
    final int seed = m_learnerParameterSettings.getInteger(LearnerParameter.SEED);
    final double learningRate = m_learnerParameterSettings.getDouble(LearnerParameter.GLOBAL_LEARNING_RATE);
    final double minLearningRate = m_wordVecParameterSettings
            .getDouble(WordVectorLearnerParameter.MIN_LEARNING_RATE);
    final int windowSize = m_wordVecParameterSettings.getInteger(WordVectorLearnerParameter.WINDOW_SIZE);
    final int epochs = m_dataParameterSettings.getInteger(DataParameter.EPOCHS);
    final int batchSize = m_dataParameterSettings.getInteger(DataParameter.BATCH_SIZE);

    // sentence tokenizer and preprocessing; CommonPreprocessor is only applied
    // when the user enabled basic preprocessing in the dialog
    final boolean usePreproc = m_wordVecParameterSettings
            .getBoolean(WordVectorLearnerParameter.USE_BASIC_PREPROCESSING);
    final TokenizerFactory t = new DefaultTokenizerFactory();
    if (usePreproc) {
        t.setTokenPreProcessor(new CommonPreprocessor());
    }

    switch (mode) {
    case DOC2VEC:
        // label-aware iteration over (document, label) rows of the input table
        final LabelAwareIterator docIter = new BufferedDataTableLabelledDocumentIterator(table,
                documentColumnName, labelColumnName);

        // build doc2vec model
        final ParagraphVectors d2v = new ParagraphVectors.Builder().learningRate(learningRate)
                .minLearningRate(minLearningRate).seed(seed).layerSize(layerSize).batchSize(batchSize)
                .windowSize(windowSize).minWordFrequency(minWordFrequency).iterations(trainingIterations)
                .epochs(epochs).iterate(docIter).trainWordVectors(true).tokenizerFactory(t)
                .allowParallelTokenization(false).build();

        d2v.fit();
        wordVectors = d2v;

        break;

    case WORD2VEC:
        // plain sentence iteration over the document column only
        final SentenceIterator sentenceIter = new BufferedDataTableSentenceIterator(table, documentColumnName);

        // build word2vec model
        final Word2Vec w2v = new Word2Vec.Builder().learningRate(learningRate).minLearningRate(minLearningRate)
                .seed(seed).layerSize(layerSize).batchSize(batchSize).windowSize(windowSize)
                .minWordFrequency(minWordFrequency).iterations(trainingIterations).epochs(epochs)
                .iterate(sentenceIter).tokenizerFactory(t).allowParallelTokenization(false).build();

        w2v.fit();
        wordVectors = w2v;

        break;

    default:
        throw new InvalidSettingsException("No case defined for WordVectorTrainingMode: " + mode);
    }

    final WordVectorPortObject outPortObject = new WordVectorPortObject(wordVectors, m_outputSpec);
    return new WordVectorPortObject[] { outPortObject };
}

From source file:uk.bl.wa.nlp.wordvec.WordvecProcessor.java

License:Open Source License

/**
 * Trains a Word2Vec model on the Mona Lisa sample text, persists the vectors and
 * the full model, and prints the words nearest to "french".
 */
public static void main(String[] args) throws Exception {

    // Use the Stanford NLP sentence splitter over the sample text.
    // NOTE(review): FileReader uses the platform default charset — confirm it
    // matches the encoding of the input file
    SentenceIterator sentences = new StanfordSentenceIterator(
            new FileReader("src/test/resources/Mona_Lisa.txt"));

    // Whitespace tokenization with the standard token normalization
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .layerSize(100)
            .seed(42)
            .windowSize(5)
            .iterate(sentences)
            .tokenizerFactory(tokenizerFactory)
            .build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");

    // Persist both the plain word vectors and the full model
    WordVectorSerializer.writeWordVectors(vec, "pathToWriteto.txt");
    WordVectorSerializer.writeFullModel(vec, "pathToWriteto.model");

    log.info("Closest Words:");
    Collection<String> nearest = vec.wordsNearest("french", 10);
    System.out.println(nearest);
}