Example usage for the org.deeplearning4j.text.sentenceiterator.BasicLineIterator constructor BasicLineIterator(String)

List of usage examples for the org.deeplearning4j.text.sentenceiterator.BasicLineIterator constructor BasicLineIterator(String)

Introduction

On this page you can find usage examples for the BasicLineIterator(String) constructor of org.deeplearning4j.text.sentenceiterator.BasicLineIterator.

Prototype

public BasicLineIterator(@NonNull String filePath) throws FileNotFoundException 

Source Link

Usage

From source file: doc2vec.LuceneDocIterator.java

/**
 * Builds a {@code ParagraphVectors} model from the text file at {@code docFile},
 * stores it in the {@code vec} field and fits it.
 *
 * <p>The file is read through a {@code BasicLineIterator}; tokens are produced by a
 * {@code DefaultTokenizerFactory} with a {@code CommonPreprocessor} attached.
 * NOTE(review): presumably each line of the file is one document and the
 * {@code "DOCNO_"} template yields one generated label per line - confirm against
 * the caller that prepares {@code docFile}.
 *
 * <p>{@code minwordfreq} and {@code numDimensions} are read from state declared
 * outside this method.
 *
 * @param docFile path of the input text file
 * @throws Exception if the iterator cannot be opened or model construction/training fails
 */
void learnDocEmbeddings(String docFile) throws Exception {
    SentenceIterator lineIterator = new BasicLineIterator(docFile);
    InMemoryLookupCache vocabulary = new InMemoryLookupCache();

    TokenizerFactory tokenizers = new DefaultTokenizerFactory();
    tokenizers.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource docLabels = new LabelsSource("DOCNO_");

    // Builder calls are kept in their original order.
    ParagraphVectors.Builder builder = new ParagraphVectors.Builder();
    vec = builder
            .minWordFrequency(minwordfreq)
            .iterations(3)
            .epochs(5)
            .layerSize(numDimensions)
            .learningRate(0.025)
            .labelsSource(docLabels)
            .windowSize(5)
            .iterate(lineIterator)
            .vocabCache(vocabulary)
            .tokenizerFactory(tokenizers)
            .sampling(0.1f)
            .workers(4)
            .trainWordVectors(true)
            .build();

    vec.fit();
}

From source file: edu.polyu.comp5412.word2vec.Word2VecTrainningChi.java

/**
 * Trains a Word2Vec model on {@code poem.txt}, writes the learned word vectors to
 * {@code poem-vec-2.txt}, and prints the nearest neighbours of a few query words.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the input file cannot be opened or training/serialization fails
 */
public static void main(String[] args) throws Exception {
    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator("poem.txt");
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    /*
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally it forces lower case for all tokens.
     */
    // Installed once; the original set the same preprocessor twice in a row.
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(3).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt");

    // NOTE(review): every query literal below is an empty string in this copy of the
    // source; presumably the original non-ASCII words were lost in extraction - restore
    // them from the original file before relying on the output.
    String[] testwords = new String[] { "", "", "", "", "" };
    for (String s : testwords) {
        // Five nearest words to each query word.
        Collection<String> lst = vec.wordsNearest(s, 5);
        System.out.println(s + " => " + lst);
    }

    // Analogy-style query: nearest words given positive and negative example lists.
    Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5);
    System.out.println(kingList);
}

From source file: edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java

/**
 * Trains a Word2Vec model on {@code gov-annc.txt}, writes the learned word vectors to
 * {@code gov-annc-vec.txt}, logs the ten words nearest to "information", and prints
 * the similarity of "information" to "data" and to "magic".
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the input file cannot be opened or training/serialization fails
 */
public static void main(String[] args) throws Exception {

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator("gov-annc.txt");
    // Lower-case every sentence before it is tokenized.
    iter.setPreProcessor(new SentencePreProcessor() {
        @Override
        public String preProcess(String sentence) {
            return sentence.toLowerCase();
        }
    });

    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    /*
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally it forces lower case for all tokens.
     */
    // The original first installed a custom stemming/digit-masking TokenPreProcess and
    // then immediately replaced it with this call before the factory was ever used, so
    // that custom preprocessor was dead code and has been removed.
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt");

    Collection<String> lst = vec.wordsNearest("information", 10);
    log.info("on 1st run: " + lst);

    System.out.println(vec.similarity("information", "data"));
    System.out.println(vec.similarity("information", "magic"));

}