Example usage for org.deeplearning4j.text.sentenceiterator SentenceIterator setPreProcessor

List of usage examples for org.deeplearning4j.text.sentenceiterator SentenceIterator setPreProcessor

Introduction

In this page you can find the example usage for org.deeplearning4j.text.sentenceiterator SentenceIterator setPreProcessor.

Prototype

void setPreProcessor(SentencePreProcessor preProcessor);

Source Link

Usage

From source file:DL4J.java

License:Open Source License

/**
 * Trains a Word2Vec model on sentences read line-by-line from a local text file,
 * logs two similarity probes, and writes the learned word vectors to disk.
 *
 * @throws IOException if the input file cannot be read or the vectors cannot be written
 */
public void run() throws IOException {
    log.info("Load data...");
    //        ClassPathResource resource = new ClassPathResource("raw_sentences.txt");
    //        SentenceIterator iter = new LineSentenceIterator(resource.getFile());
    // NOTE(review): hard-coded absolute path — consider making this a parameter.
    SentenceIterator iter = new LineSentenceIterator(new File("/Users/Joowon/Desktop/testFile.txt"));
    // Lower-case every sentence before tokenization.
    iter.setPreProcessor((SentencePreProcessor) String::toLowerCase);

    log.info("Tokenize data...");
    final EndingPreProcessor preProcessor = new EndingPreProcessor();
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(token -> {
        token = token.toLowerCase();
        // Strip common English word endings (stemming-lite).
        String base = preProcessor.preProcess(token);
        // Collapse all digits to a single placeholder character.
        base = base.replaceAll("\\d", "d");
        // Removed: a leftover debug `System.out.println()` that printed a blank
        // line for every token ending in "ly"/"ing".
        return base;
    });

    log.info("Build model...");
    int batchSize = 1000;
    int iterations = 3;
    int layerSize = 300;
    Word2Vec vec = new Word2Vec.Builder().batchSize(batchSize) // # words per minibatch.
            .minWordFrequency(5) // ignore words seen fewer than 5 times
            .useAdaGrad(false) //
            .layerSize(layerSize) // word feature vector size
            .iterations(iterations) // # iterations to train
            .learningRate(0.025) //
            .minLearningRate(1e-3) // learning rate decays wrt # words. floor learning
            .negativeSample(10) // sample size 10 words
            .iterate(iter) //
            .tokenizerFactory(tokenizerFactory) //
            .build();
    vec.fit();

    log.info("evaluate model...");
    double sim = vec.similarity("people", "money");
    // Fixed typo ("peeple") in the log message.
    log.info("Similarity between people and money: " + sim);
    Collection<String> similar = vec.wordsNearest("human", 10);
    // Fixed log message: the queried word is "human", not "day".
    log.info("Similar words to 'human' : " + similar);

    //        log.info("Plot TSNE");
    //        BarnesHutTsne tsne = new BarnesHutTsne.Builder()
    //                .setMaxIter(1000)
    //                .stopLyingIteration(250)
    //                .learningRate(500)
    //                .useAdaGrad(false)
    //                .theta(0.5)
    //                .setMomentum(0.5)
    //                .normalize(true)
    //                .usePca(false)
    //                .build();
    //        vec.lookupTable().plotVocab(tsne);

    log.info("Save vectors....");
    WordVectorSerializer.writeWordVectors(vec, "/Users/Joowon/Desktop/words.txt");
}

From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java

/**
 * Trains a Word2Vec model on lines of "gov-annc.txt", saves the vectors to
 * "gov-annc-vec.txt", and prints a few similarity probes.
 *
 * @param args unused
 * @throws Exception if reading the corpus or writing the vectors fails
 */
public static void main(String[] args) throws Exception {

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator("gov-annc.txt");
    iter.setPreProcessor(new SentencePreProcessor() {
        @Override
        public String preProcess(String sentence) {
            return sentence.toLowerCase();
        }
    });

    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    // Removed: a first setTokenPreProcessor(...) call (EndingPreProcessor-based,
    // with a stray debug println) that was dead code — it was immediately
    // overwritten by the CommonPreprocessor below and never took effect.
    /*
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally it forces lower case for all tokens.
     */
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt");

    Collection<String> lst = vec.wordsNearest("information", 10);
    log.info("on 1st run: " + lst);

    System.out.println(vec.similarity("information", "data"));
    System.out.println(vec.similarity("information", "magic"));

}

From source file:net.liaocy.ml4j.nlp.word2vec.Train.java

/**
 * Fits a Word2Vec model over the supplied sentence collection and persists it
 * under the given model name. Sentences and tokens are used verbatim
 * (identity pre-processors), so any normalization must happen upstream.
 *
 * @param commaSentences raw sentences, one string per sentence
 * @param lang           language passed to the project's tokenizer factory
 * @param modelName      key under which the fitted model is saved
 * @throws IOException if persisting the model fails
 */
public void train(Collection<String> commaSentences, Language lang, String modelName) throws IOException {

    System.out.println("Load & Vectorize Sentences....");
    SentenceIterator sentenceIterator = new CollectionSentenceIterator(commaSentences);
    // Identity pre-processor: sentences pass through untouched.
    sentenceIterator.setPreProcessor(sentence -> sentence);

    // Identity token pre-processor: tokens pass through untouched.
    MyTokenizerFactory tokenizerFactory = new MyTokenizerFactory(lang);
    tokenizerFactory.setTokenPreProcessor(token -> token);

    System.out.println("Building model....");
    Word2Vec word2Vec = new Word2Vec.Builder()
            .minWordFrequency(0)
            .iterations(1)
            .layerSize(200)
            .seed(42)
            .windowSize(5)
            .learningRate(0.025)
            .iterate(sentenceIterator)
            .tokenizerFactory(tokenizerFactory)
            .build();

    System.out.println("Fitting Word2Vec model....");
    word2Vec.fit();

    System.out.println("Save Model...");
    this.saveModel(modelName, word2Vec);
}