Example usage for the org.deeplearning4j.text.sentenceiterator.LineSentenceIterator constructor

List of usage examples for the org.deeplearning4j.text.sentenceiterator.LineSentenceIterator constructor

Introduction

On this page you can find example usage of the LineSentenceIterator(File) constructor of org.deeplearning4j.text.sentenceiterator.LineSentenceIterator.

Prototype

public LineSentenceIterator(File f) 

Source Link

Usage

From source file:DL4J.java

License:Open Source License

/**
 * Trains a Word2Vec model on the sentences read through a {@code LineSentenceIterator},
 * logs two similarity queries against the trained model, and writes the word vectors
 * to a text file.
 *
 * <p>The input corpus and the output file are hard-coded absolute paths; adjust them
 * before running on another machine.
 *
 * @throws IOException propagated from the underlying file-based calls
 */
public void run() throws IOException {
    log.info("Load data...");
    // Alternative input source: load the corpus from the classpath instead of an absolute path.
    //        ClassPathResource resource = new ClassPathResource("raw_sentences.txt");
    //        SentenceIterator iter = new LineSentenceIterator(resource.getFile());
    SentenceIterator iter = new LineSentenceIterator(new File("/Users/Joowon/Desktop/testFile.txt"));
    // Locale.ROOT keeps lower-casing independent of the JVM's default locale
    // (e.g. a Turkish default locale would map 'I' to a dotless 'i').
    iter.setPreProcessor((SentencePreProcessor) sentence -> sentence.toLowerCase(java.util.Locale.ROOT));

    log.info("Tokenize data...");
    final EndingPreProcessor preProcessor = new EndingPreProcessor();
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(token -> {
        // Lower-case, run the EndingPreProcessor, then replace every digit with the letter 'd'.
        String base = preProcessor.preProcess(token.toLowerCase(java.util.Locale.ROOT));
        return base.replaceAll("\\d", "d");
    });

    log.info("Build model...");
    int batchSize = 1000;
    int iterations = 3;
    int layerSize = 300;
    Word2Vec vec = new Word2Vec.Builder().batchSize(batchSize) // # words per minibatch.
            .minWordFrequency(5)
            .useAdaGrad(false)
            .layerSize(layerSize) // word feature vector size
            .iterations(iterations) // # iterations to train
            .learningRate(0.025)
            .minLearningRate(1e-3) // learning rate decays wrt # words. floor learning
            .negativeSample(10) // sample size 10 words
            .iterate(iter)
            .tokenizerFactory(tokenizerFactory)
            .build();
    vec.fit();

    log.info("evaluate model...");
    double sim = vec.similarity("people", "money");
    log.info("Similarity between people and money: " + sim);
    // Build the log message from the queried word so the two cannot drift apart.
    final String queryWord = "human";
    Collection<String> similar = vec.wordsNearest(queryWord, 10);
    log.info("Similar words to '" + queryWord + "' : " + similar);

    // Optional t-SNE plot of the vocabulary (disabled):
    //        log.info("Plot TSNE");
    //        BarnesHutTsne tsne = new BarnesHutTsne.Builder()
    //                .setMaxIter(1000)
    //                .stopLyingIteration(250)
    //                .learningRate(500)
    //                .useAdaGrad(false)
    //                .theta(0.5)
    //                .setMomentum(0.5)
    //                .normalize(true)
    //                .usePca(false)
    //                .build();
    //        vec.lookupTable().plotVocab(tsne);

    log.info("Save vectors....");
    WordVectorSerializer.writeWordVectors(vec, "/Users/Joowon/Desktop/words.txt");
}