List of usage examples for org.deeplearning4j.models.embeddings.loader WordVectorSerializer writeWordVectors
public static void writeWordVectors(@NonNull FastText vectors, @NonNull File path) throws IOException
Note: writeWordVectors is overloaded; the examples below use other variants as well, e.g. writeWordVectors(Word2Vec, String) and writeWordVectors(WordVectors, BufferedWriter), not only the FastText overload shown above.
From source file:DL4J.java
License:Open Source License
public void run() throws IOException { log.info("Load data..."); // ClassPathResource resource = new ClassPathResource("raw_sentences.txt"); // SentenceIterator iter = new LineSentenceIterator(resource.getFile()); SentenceIterator iter = new LineSentenceIterator(new File("/Users/Joowon/Desktop/testFile.txt")); iter.setPreProcessor((SentencePreProcessor) String::toLowerCase); log.info("Tokenize data..."); final EndingPreProcessor preProcessor = new EndingPreProcessor(); TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(token -> { token = token.toLowerCase();/*from w w w. jav a 2 s. c om*/ String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) System.out.println(); return base; }); log.info("Build model..."); int batchSize = 1000; int iterations = 3; int layerSize = 300; Word2Vec vec = new Word2Vec.Builder().batchSize(batchSize) // # words per minibatch. .minWordFrequency(5) // .useAdaGrad(false) // .layerSize(layerSize) // word feature vector size .iterations(iterations) // # iterations to train .learningRate(0.025) // .minLearningRate(1e-3) // learning rate decays wrt # words. 
floor learning .negativeSample(10) // sample size 10 words .iterate(iter) // .tokenizerFactory(tokenizerFactory) // .build(); vec.fit(); log.info("evaluate model..."); double sim = vec.similarity("people", "money"); log.info("Similarity between peeple and money: " + sim); Collection<String> similar = vec.wordsNearest("human", 10); log.info("Similarity words to 'day' : " + similar); // log.info("Plot TSNE"); // BarnesHutTsne tsne = new BarnesHutTsne.Builder() // .setMaxIter(1000) // .stopLyingIteration(250) // .learningRate(500) // .useAdaGrad(false) // .theta(0.5) // .setMomentum(0.5) // .normalize(true) // .usePca(false) // .build(); // vec.lookupTable().plotVocab(tsne); log.info("Save vectors...."); WordVectorSerializer.writeWordVectors(vec, "/Users/Joowon/Desktop/words.txt"); }
From source file:doc2vec.LuceneDocIterator.java
/**
 * Learns document embeddings from the configured Lucene index and writes the
 * resulting doc vectors to the output file named by the {@code dvec.out.file}
 * property.
 *
 * @throws Exception if training fails or the output file cannot be written
 */
public void processAll() throws Exception {
    System.out.println("Learning doc embeddings");
    /*
     * Alternative: train docvec on a plain file (one sentence per line):
     *   String docFileName = prop.getProperty("docvec.in.file");
     *   learnDocEmbeddings(docFileName);
     */
    // Train doc2vec on the Lucene index.
    String indexPath = prop.getProperty("index");
    learnDocEmbeddings(new File(indexPath));

    String outDocVecFile = prop.getProperty("dvec.out.file");
    // fixed: try-with-resources guarantees the writer is closed (and flushed)
    // even if writeWordVectors throws; the original leaked it on failure.
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(outDocVecFile))) {
        System.out.println("Writing out the doc vectors for indexing...");
        WordVectorSerializer.writeWordVectors(vec, bw);
    }
}
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningChi.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("poem.txt"); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); /*/*from w ww . j a va 2 s .c om*/ CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(3).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt"); String[] testwords = new String[] { "", "", "", "", "" }; for (String s : testwords) { Collection<String> lst = vec.wordsNearest(s, 5); System.out.println(s + " => " + lst); } Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5); System.out.println(kingList); }
From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java
public static void main(String[] args) throws Exception { log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = new BasicLineIterator("gov-annc.txt"); iter.setPreProcessor(new SentencePreProcessor() { @Override// w ww . j a v a 2s.co m public String preProcess(String sentence) { return sentence.toLowerCase(); } }); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); final EndingPreProcessor preProcessor = new EndingPreProcessor(); t.setTokenPreProcessor(new TokenPreProcess() { @Override public String preProcess(String token) { token = token.toLowerCase(); String base = preProcessor.preProcess(token); base = base.replaceAll("\\d", "d"); if (base.endsWith("ly") || base.endsWith("ing")) { System.out.println(); } return base; } }); /* CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+ So, effectively all numbers, punctuation symbols and some special symbols are stripped off. Additionally it forces lower case for all tokens. */ t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt"); Collection<String> lst = vec.wordsNearest("information", 10); log.info("on 1st run: " + lst); System.out.println(vec.similarity("information", "data")); System.out.println(vec.similarity("information", "magic")); }
From source file:uk.bl.wa.nlp.wordvec.WordvecProcessor.java
License:Open Source License
public static void main(String[] args) throws Exception { SentenceIterator iter = new StanfordSentenceIterator(new FileReader("src/test/resources/Mona_Lisa.txt")); // Use Stanford NLP sentence splitter: // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit();/* w w w . j a v a2 s . c o m*/ log.info("Writing word vectors to text file...."); // Write word vectors WordVectorSerializer.writeWordVectors(vec, "pathToWriteto.txt"); WordVectorSerializer.writeFullModel(vec, "pathToWriteto.model"); log.info("Closest Words:"); Collection<String> lst = vec.wordsNearest("french", 10); System.out.println(lst); // UiServer server = UiServer.getInstance(); // System.out.println("Started on port " + server.getPort()); }