Example usage for org.deeplearning4j.text.tokenization.tokenizerfactory DefaultTokenizerFactory DefaultTokenizerFactory

List of usage examples for org.deeplearning4j.text.tokenization.tokenizerfactory DefaultTokenizerFactory DefaultTokenizerFactory

Introduction

On this page you can find an example usage of org.deeplearning4j.text.tokenization.tokenizerfactory DefaultTokenizerFactory DefaultTokenizerFactory.

Prototype

DefaultTokenizerFactory

Source Link

Usage

From source file:DL4J.java

License:Open Source License

/**
 * Trains a Word2Vec model on a line-per-sentence text file, logs two sanity-check
 * queries, and serializes the learned word vectors to disk.
 *
 * @throws IOException if the input file cannot be read or the vectors cannot be written
 */
public void run() throws IOException {
    log.info("Load data...");
    // NOTE(review): hard-coded local path — consider a ClassPathResource or a config value.
    SentenceIterator iter = new LineSentenceIterator(new File("/Users/Joowon/Desktop/testFile.txt"));
    iter.setPreProcessor((SentencePreProcessor) String::toLowerCase);

    log.info("Tokenize data...");
    final EndingPreProcessor preProcessor = new EndingPreProcessor();
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    // Normalize each token: lower-case, strip common English word endings,
    // and collapse every digit to the placeholder character 'd'.
    // (A leftover debug println for "-ly"/"-ing" tokens was removed.)
    tokenizerFactory.setTokenPreProcessor(token -> {
        token = token.toLowerCase();
        String base = preProcessor.preProcess(token);
        base = base.replaceAll("\\d", "d");
        return base;
    });

    log.info("Build model...");
    int batchSize = 1000;
    int iterations = 3;
    int layerSize = 300;
    Word2Vec vec = new Word2Vec.Builder().batchSize(batchSize) // # words per minibatch.
            .minWordFrequency(5) // ignore words seen fewer than 5 times
            .useAdaGrad(false) //
            .layerSize(layerSize) // word feature vector size
            .iterations(iterations) // # iterations to train
            .learningRate(0.025) //
            .minLearningRate(1e-3) // learning rate decays wrt # words. floor learning
            .negativeSample(10) // sample size 10 words
            .iterate(iter) //
            .tokenizerFactory(tokenizerFactory) //
            .build();
    vec.fit();

    log.info("evaluate model...");
    double sim = vec.similarity("people", "money");
    // Fixed: the message previously said "peeple".
    log.info("Similarity between people and money: " + sim);
    Collection<String> similar = vec.wordsNearest("human", 10);
    // Fixed: the message previously said 'day' although the query word is "human".
    log.info("Similar words to 'human' : " + similar);

    log.info("Save vectors....");
    WordVectorSerializer.writeWordVectors(vec, "/Users/Joowon/Desktop/words.txt");
}

From source file:com.github.tteofili.p2h.Par2HierTest.java

License:Apache License

@Test
public void testP2HOnMTPapers() throws Exception {
    ParagraphVectors paragraphVectors;//from   www  .  j  a v  a  2  s .  c o  m
    LabelAwareIterator iterator;
    TokenizerFactory tokenizerFactory;
    ClassPathResource resource = new ClassPathResource("papers/sbc");

    // build a iterator for our MT papers dataset
    iterator = new FilenamesLabelAwareIterator.Builder().addSourceFolder(resource.getFile()).build();

    tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    Map<String, INDArray> hvs = new TreeMap<>();
    Map<String, INDArray> pvs = new TreeMap<>();

    paragraphVectors = new ParagraphVectors.Builder().iterate(iterator).tokenizerFactory(tokenizerFactory)
            .build();

    // fit model
    paragraphVectors.fit();

    Par2Hier par2Hier = new Par2Hier(paragraphVectors, method, k);

    // fit model
    par2Hier.fit();

    Map<String, String[]> comparison = new TreeMap<>();

    // extract paragraph vectors similarities
    WeightLookupTable<VocabWord> lookupTable = paragraphVectors.getLookupTable();
    List<String> labels = paragraphVectors.getLabelsSource().getLabels();
    for (String label : labels) {
        INDArray vector = lookupTable.vector(label);
        pvs.put(label, vector);
        Collection<String> strings = paragraphVectors.nearestLabels(vector, 2);
        Collection<String> hstrings = par2Hier.nearestLabels(vector, 2);
        String[] stringsArray = new String[2];
        stringsArray[0] = new LinkedList<>(strings).get(1);
        stringsArray[1] = new LinkedList<>(hstrings).get(1);
        comparison.put(label, stringsArray);
        hvs.put(label, par2Hier.getLookupTable().vector(label));
    }

    System.out.println("--->func(args):pv,p2h");

    // measure similarity indexes
    double[] intraDocumentSimilarity = getIntraDocumentSimilarity(comparison);
    System.out.println("ids(" + k + "," + method + "):" + Arrays.toString(intraDocumentSimilarity));
    double[] depthSimilarity = getDepthSimilarity(comparison);
    System.out.println("ds(" + k + "," + method + "):" + Arrays.toString(depthSimilarity));

    // classification
    Map<Integer, Map<Integer, Long>> pvCounts = new HashMap<>();
    Map<Integer, Map<Integer, Long>> p2hCounts = new HashMap<>();
    for (String label : labels) {

        INDArray vector = lookupTable.vector(label);
        int topN = 1;
        Collection<String> strings = paragraphVectors.nearestLabels(vector, topN);
        Collection<String> hstrings = par2Hier.nearestLabels(vector, topN);
        int labelDepth = label.split("\\.").length - 1;

        int stringDepth = getClass(strings);
        int hstringDepth = getClass(hstrings);

        updateCM(pvCounts, labelDepth, stringDepth);
        updateCM(p2hCounts, labelDepth, hstringDepth);
    }

    ConfusionMatrix pvCM = new ConfusionMatrix(pvCounts);
    ConfusionMatrix p2hCM = new ConfusionMatrix(p2hCounts);

    System.out.println("mf1(" + k + "," + method + "):" + pvCM.getF1Measure() + "," + p2hCM.getF1Measure());
    System.out.println("acc(" + k + "," + method + "):" + pvCM.getAccuracy() + "," + p2hCM.getAccuracy());

    // create a CSV with a raw comparison
    File pvFile = Files.createFile(Paths.get("target/comparison-" + k + "-" + method + ".csv")).toFile();
    FileOutputStream pvOutputStream = new FileOutputStream(pvFile);

    try {
        Map<String, INDArray> pvs2 = Par2HierUtils.svdPCA(pvs, 2);
        Map<String, INDArray> hvs2 = Par2HierUtils.svdPCA(hvs, 2);
        String pvCSV = asStrings(pvs2, hvs2);
        IOUtils.write(pvCSV, pvOutputStream);
    } finally {
        pvOutputStream.flush();
        pvOutputStream.close();
    }
}

From source file:doc2vec.LuceneDocIterator.java

/**
 * Learns paragraph (doc2vec) embeddings from a Lucene index and stores the trained
 * model in the {@code vec} field. Document labels are generated as "DOCNO_&lt;n&gt;".
 *
 * @param indexDir directory containing the Lucene index to iterate over
 * @throws Exception if the iterator or training fails
 */
void learnDocEmbeddings(File indexDir) throws Exception {

    // Whether per-word labels were stored in the index (property default: false).
    final boolean storedLabels = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));
    final SentenceIterator sentences = new LuceneDocIterator(indexDir, stopFile, storedLabels);
    final InMemoryLookupCache vocabCache = new InMemoryLookupCache();

    final TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    final LabelsSource docLabels = new LabelsSource("DOCNO_");

    vec = new ParagraphVectors.Builder()
            .minWordFrequency(minwordfreq)
            .iterations(3)
            .epochs(5)
            .layerSize(numDimensions)
            .learningRate(0.025)
            .labelsSource(docLabels)
            .windowSize(5)
            .iterate(sentences)
            .vocabCache(vocabCache)
            .tokenizerFactory(tokenizer)
            .sampling(0.1f)
            .workers(4)
            .trainWordVectors(true)
            .build();
    vec.fit();
}

From source file:doc2vec.LuceneDocIterator.java

/**
 * Learns paragraph (doc2vec) embeddings from a plain text file (one document per line)
 * and stores the trained model in the {@code vec} field. Document labels are generated
 * as "DOCNO_&lt;n&gt;".
 *
 * @param docFile path to the line-per-document text file
 * @throws Exception if the iterator or training fails
 */
void learnDocEmbeddings(String docFile) throws Exception {

    final SentenceIterator sentences = new BasicLineIterator(docFile);
    final InMemoryLookupCache vocabCache = new InMemoryLookupCache();

    final TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    final LabelsSource docLabels = new LabelsSource("DOCNO_");

    vec = new ParagraphVectors.Builder()
            .minWordFrequency(minwordfreq)
            .iterations(3)
            .epochs(5)
            .layerSize(numDimensions)
            .learningRate(0.025)
            .labelsSource(docLabels)
            .windowSize(5)
            .iterate(sentences)
            .vocabCache(vocabCache)
            .tokenizerFactory(tokenizer)
            .sampling(0.1f)
            .workers(4)
            .trainWordVectors(true)
            .build();
    vec.fit();
}

From source file:dollar.learner.smart.ParagraphVectorsClassifierExample.java

License:Apache License

/**
 * Builds and trains the ParagraphVectors model over the labeled corpus under
 * {@code TYPE_LEARNING_DIR/corpus}, storing the iterator, tokenizer factory and
 * trained model in the corresponding fields.
 *
 * @throws Exception if the corpus iterator cannot be built or training fails
 */
void makeParagraphVectors() throws Exception {

    // Build an iterator for our dataset.
    File dir = TYPE_LEARNING_DIR;
    // Fail fast if the learning directory cannot be created; the mkdirs() return
    // value was previously ignored, which silently masked setup failures.
    if (!dir.mkdirs() && !dir.isDirectory()) {
        throw new IllegalStateException("Could not create learning directory: " + dir);
    }
    iterator = new FileLabelAwareIterator.Builder().addSourceFolder(new File(dir, "corpus")).build();

    tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // ParagraphVectors training configuration
    paragraphVectors = new ParagraphVectors.Builder().learningRate(0.025).minLearningRate(0.001).batchSize(1000)
            .epochs(5).iterate(iterator).trainWordVectors(true).tokenizerFactory(tokenizerFactory).build();

    // Start model training
    paragraphVectors.fit();
}

From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningChi.java

public static void main(String[] args) throws Exception {
    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator("poem.txt");
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    /*/*from www  . java  2 s  .  c o m*/
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally it forces lower case for all tokens.
     */
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(3).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt");

    String[] testwords = new String[] { "", "", "", "", "" };
    for (String s : testwords) {
        Collection<String> lst = vec.wordsNearest(s, 5);
        System.out.println(s + " => " + lst);
    }

    Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5);
    System.out.println(kingList);
}

From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java

/**
 * Trains a Word2Vec model on "gov-annc.txt" (one sentence per line), writes the
 * vectors to "gov-annc-vec.txt", and prints nearest-neighbour and similarity checks.
 *
 * @param args unused
 * @throws Exception if the corpus cannot be read or vectors cannot be written
 */
public static void main(String[] args) throws Exception {

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator("gov-annc.txt");
    // Lower-case each sentence before tokenization.
    iter.setPreProcessor(new SentencePreProcessor() {
        @Override
        public String preProcess(String sentence) {
            return sentence.toLowerCase();
        }
    });

    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    /*
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally it forces lower case for all tokens.
     */
    // Fixed: a custom EndingPreProcessor-based token preprocessor was previously set
    // here and then immediately overwritten by this CommonPreprocessor before any
    // tokenization occurred — that dead code (and its unused locals) has been removed.
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt");

    Collection<String> lst = vec.wordsNearest("information", 10);
    log.info("on 1st run: " + lst);

    System.out.println(vec.similarity("information", "data"));
    System.out.println(vec.similarity("information", "magic"));

}

From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java

License:Open Source License

/**
 * Replaces each word contained in a document with its corresponding word vector. If a word from the document is not
 * contained in the used {@link WordVectors} model it will be skipped. The output is a {@link ListCell} containing
 * {@link ListCell}s containing the word vectors as {@link DoubleCell}s.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document to use/*from   ww  w .j  av a2  s.c  o m*/
 * @return {@link ListCell} of {@link ListCell}c of {@link DoubleCell}s containing converted words
 */
/**
 * Replaces each word contained in a document with its corresponding word vector. If a word from the document is not
 * contained in the used {@link WordVectors} model it is skipped and recorded in {@code m_unknownWords}. The output
 * is a {@link ListCell} containing {@link ListCell}s containing the word vectors as {@link DoubleCell}s.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document to convert
 * @return {@link ListCell} of {@link ListCell}s of {@link DoubleCell}s containing converted words
 */
private ListCell replaceWordsByWordVector(final WordVectors wordVec, final String document) {
    final TokenizerFactory factory = new DefaultTokenizerFactory();
    factory.setTokenPreProcessor(new CommonPreprocessor());

    final Tokenizer tokenizer = factory.create(document);
    final List<ListCell> vectorCells = new ArrayList<>();

    while (tokenizer.hasMoreTokens()) {
        final String word = tokenizer.nextToken();
        if (word.isEmpty()) {
            continue; // preprocessor may reduce a token to the empty string
        }
        if (wordVec.hasWord(word)) {
            vectorCells.add(wordToListCell(wordVec, word));
        } else {
            m_unknownWords.add(word);
        }
    }
    return CollectionCellFactory.createListCell(vectorCells);
}

From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel.java

License:Open Source License

/**
 * Calculates the mean vector of all word vectors of all words contained in a document.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document for which the mean should be calculated
 * @return {@link INDArray} containing the mean vector of the document
 *//*from  w  w w.j  a  v a  2s.c o m*/
/**
 * Calculates the mean vector of all word vectors of all words contained in a document.
 * Words not present in the model's vocabulary are skipped and recorded in
 * {@code m_unknownWords}.
 *
 * @param wordVec the {@link WordVectors} model to use
 * @param document the document for which the mean should be calculated
 * @return {@link INDArray} containing the mean vector of the document
 */
private INDArray calculateDocumentMean(final WordVectors wordVec, final String document) {
    final TokenizerFactory tokenizerFac = new DefaultTokenizerFactory();
    tokenizerFac.setTokenPreProcessor(new CommonPreprocessor());

    final Tokenizer t = tokenizerFac.create(document);
    final List<String> tokens = t.getTokens();

    // Count rows first so the matrix can be sized exactly. The empty-token guard now
    // matches the fill loop below; previously the count loop lacked it, so an empty
    // token matching the vocabulary would over-allocate rows and bias the mean with
    // all-zero rows.
    int numberOfWordsMatchingWithVoc = 0;
    for (final String token : tokens) {
        if (!token.isEmpty() && wordVec.hasWord(token)) {
            numberOfWordsMatchingWithVoc++;
        }
    }

    // One row per in-vocabulary token.
    final INDArray documentWordVectors = Nd4j.create(numberOfWordsMatchingWithVoc,
            wordVec.lookupTable().layerSize());

    int i = 0;
    for (final String token : tokens) {
        if (!token.isEmpty()) {
            if (wordVec.hasWord(token)) {
                documentWordVectors.putRow(i, wordVec.getWordVectorMatrix(token));
                i++;
            } else {
                m_unknownWords.add(token);
            }
        }
    }
    // Mean over the row axis -> a single vector of the document's word embeddings.
    final INDArray documentMeanVector = documentWordVectors.mean(0);
    return documentMeanVector;
}

From source file:org.knime.ext.textprocessing.dl4j.nodes.embeddings.apply.WordVectorApplyNodeModel2.java

License:Open Source License

/**
 * Converts the document cell of one input row into word vectors and appends the result
 * as a new cell. Tokens not in the model's vocabulary are counted in
 * {@code m_unknownWordsCtr}; all tokens are counted in {@code m_totalWordsCtr}. If no
 * token matches the vocabulary, a {@code MissingCell} is appended instead. Depending on
 * the {@code m_calculateMean} setting the appended cell is either the document's mean
 * vector or a list of per-word vectors.
 *
 * @param row the input row to process
 * @param documentColumnIndex index of the column holding the document text
 * @param wordVectors the word-vector model used for the lookup
 * @return the input row with the converted document appended as an additional cell
 * @throws DataCellConversionException if the document cell cannot be converted to a String
 * @throws IllegalStateException propagated from downstream conversion helpers
 */
private DataRow processRow(final DataRow row, final int documentColumnIndex, final WordVectors wordVectors)
        throws DataCellConversionException, IllegalStateException {

    final List<DataCell> cells = TableUtils.toListOfCells(row);
    final DataCell cell = row.getCell(documentColumnIndex);

    final String document = ConverterUtils.convertDataCellToJava(cell, String.class);
    ListCell convertedDocument;

    // Tokenize with the default factory (no token preprocessor is set here —
    // NOTE(review): siblings of this method use CommonPreprocessor; confirm whether
    // the omission is intentional).
    final Tokenizer t = new DefaultTokenizerFactory().create(document);

    // Partition tokens into in-vocabulary (kept) and unknown (counted only).
    final List<String> matchingTokens = new ArrayList<String>();
    for (final String token : t.getTokens()) {
        if (wordVectors.hasWord(token)) {
            matchingTokens.add(token);
        } else {
            m_unknownWordsCtr++;
        }
        m_totalWordsCtr++;
    }

    if (matchingTokens.size() == 0) {
        // No usable tokens: emit a MissingCell carrying the reason for this row.
        cells.add(new MissingCell("No tokens in row " + row.getKey() + " match the vocabulary!"));
    } else {
        if (m_calculateMean.getBooleanValue()) {
            // Single mean vector over all matching tokens.
            final INDArray documentMeanVector = calculateDocumentMean(wordVectors, matchingTokens);
            convertedDocument = CollectionCellFactory
                    .createListCell(NDArrayUtils.toListOfDoubleCells(documentMeanVector));
        } else {
            // One vector per matching token.
            convertedDocument = replaceWordsByWordVector(wordVectors, matchingTokens);
        }
        cells.add(convertedDocument);
    }
    return new DefaultRow(row.getKey(), cells);
}