Example usage for org.deeplearning4j.text.sentenceiterator SentencePreProcessor SentencePreProcessor

List of usage examples for org.deeplearning4j.text.sentenceiterator SentencePreProcessor SentencePreProcessor

Introduction

On this page you can find example usages of org.deeplearning4j.text.sentenceiterator.SentencePreProcessor; each example below instantiates it as an anonymous class.

Prototype

SentencePreProcessor

Source Link

Usage

From source file: edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java

/**
 * Trains a Word2Vec model on the lines of {@code gov-annc.txt}, writes the
 * resulting word vectors to {@code gov-annc-vec.txt}, then logs the 10 words
 * nearest to "information" and prints two sample similarity scores.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if reading the corpus, training, or writing the vectors fails
 */
public static void main(String[] args) throws Exception {

    log.info("Load & Vectorize Sentences....");
    // Read the corpus one line per sentence and lower-case every line.
    SentenceIterator iter = new BasicLineIterator("gov-annc.txt");
    iter.setPreProcessor(new SentencePreProcessor() {
        @Override
        public String preProcess(String sentence) {
            // Locale.ROOT keeps the result independent of the JVM's default locale.
            return sentence.toLowerCase(java.util.Locale.ROOT);
        }
    });

    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    /*
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally it forces lower case for all tokens.
     */
    // NOTE: an earlier custom TokenPreProcess used to be installed here, but it was
    // replaced by this call before the factory was ever used, so it had no effect.
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt");

    Collection<String> lst = vec.wordsNearest("information", 10);
    log.info("on 1st run: " + lst);

    System.out.println(vec.similarity("information", "data"));
    System.out.println(vec.similarity("information", "magic"));

}

From source file: net.liaocy.ml4j.nlp.word2vec.Train.java

/**
 * Fits a Word2Vec model on the supplied sentences and hands it to
 * {@code saveModel} under the given name.
 *
 * <p>Both the sentence and the token pre-processors are pass-through: the
 * text reaches the tokenizer and the model exactly as supplied.
 *
 * @param commaSentences the sentences to train on
 * @param lang           language passed to the tokenizer factory
 * @param modelName      name under which the fitted model is saved
 * @throws IOException declared by this method; presumably raised while saving the model
 */
public void train(Collection<String> commaSentences, Language lang, String modelName) throws IOException {

    System.out.println("Load & Vectorize Sentences....");
    SentenceIterator sentences = new CollectionSentenceIterator(commaSentences);
    // Pass-through: sentences are used unchanged.
    SentencePreProcessor identitySentence = new SentencePreProcessor() {
        @Override
        public String preProcess(String sentence) {
            return sentence;
        }
    };
    sentences.setPreProcessor(identitySentence);

    MyTokenizerFactory tokenizers = new MyTokenizerFactory(lang);
    // Pass-through: tokens are used unchanged.
    TokenPreProcess identityToken = new TokenPreProcess() {
        @Override
        public String preProcess(String token) {
            return token;
        }
    };
    tokenizers.setTokenPreProcessor(identityToken);

    System.out.println("Building model....");
    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(0)
            .iterations(1)
            .layerSize(200)
            .seed(42)
            .windowSize(5)
            .learningRate(0.025)
            .iterate(sentences)
            .tokenizerFactory(tokenizers)
            .build();

    System.out.println("Fitting Word2Vec model....");
    model.fit();

    System.out.println("Save Model...");
    this.saveModel(modelName, model);
}