Example usage for org.deeplearning4j.models.embeddings.loader WordVectorSerializer writeWordVectors

List of usage examples for org.deeplearning4j.models.embeddings.loader WordVectorSerializer writeWordVectors

Introduction

On this page you can find example usage for org.deeplearning4j.models.embeddings.loader WordVectorSerializer writeWordVectors.

Prototype

public static void writeWordVectors(@NonNull FastText vectors, @NonNull File path) throws IOException 

Source Link

Document

This method writes a FastText model to the given file.
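
None of the usage examples below call this FastText overload directly, so here is a minimal sketch of it. Only the writeWordVectors(FastText, File) call comes from the prototype above; the FastText builder options and file names are assumptions made for illustration.

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.fasttext.FastText;

import java.io.File;
import java.io.IOException;

public class FastTextWriteSketch {
    public static void main(String[] args) throws IOException {
        // Train a skip-gram FastText model on a plain-text corpus
        // (the builder fields used here are assumed, not taken from this page).
        FastText fastText = FastText.builder()
                .inputFile("corpus.txt")      // hypothetical corpus, one sentence per line
                .outputFile("fasttext-model") // hypothetical output prefix
                .skipgram(true)
                .build();
        fastText.fit();

        // The overload documented above: write the trained model to a file.
        WordVectorSerializer.writeWordVectors(fastText, new File("fasttext-vectors.bin"));
    }
}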

Usage

From source file:DL4J.java

License:Open Source License

public void run() throws IOException {
    log.info("Load data...");
    //        ClassPathResource resource = new ClassPathResource("raw_sentences.txt");
    //        SentenceIterator iter = new LineSentenceIterator(resource.getFile());
    SentenceIterator iter = new LineSentenceIterator(new File("/Users/Joowon/Desktop/testFile.txt"));
    iter.setPreProcessor((SentencePreProcessor) String::toLowerCase);

    log.info("Tokenize data...");
    final EndingPreProcessor preProcessor = new EndingPreProcessor();
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(token -> {
        token = token.toLowerCase();
        String base = preProcessor.preProcess(token);
        base = base.replaceAll("\\d", "d");
        if (base.endsWith("ly") || base.endsWith("ing"))
            System.out.println();
        return base;
    });

    log.info("Build model...");
    int batchSize = 1000;
    int iterations = 3;
    int layerSize = 300;
    Word2Vec vec = new Word2Vec.Builder().batchSize(batchSize) // # words per minibatch.
            .minWordFrequency(5) //
            .useAdaGrad(false) //
            .layerSize(layerSize) // word feature vector size
            .iterations(iterations) // # iterations to train
            .learningRate(0.025) //
            .minLearningRate(1e-3) // learning rate decays w.r.t. # words seen; this is the floor (minimum) learning rate
            .negativeSample(10) // negative sampling with 10 samples
            .iterate(iter) //
            .tokenizerFactory(tokenizerFactory) //
            .build();
    vec.fit();

    log.info("evaluate model...");
    double sim = vec.similarity("people", "money");
    log.info("Similarity between peeple and money: " + sim);
    Collection<String> similar = vec.wordsNearest("human", 10);
    log.info("Similarity words to 'day' : " + similar);

    //        log.info("Plot TSNE");
    //        BarnesHutTsne tsne = new BarnesHutTsne.Builder()
    //                .setMaxIter(1000)
    //                .stopLyingIteration(250)
    //                .learningRate(500)
    //                .useAdaGrad(false)
    //                .theta(0.5)
    //                .setMomentum(0.5)
    //                .normalize(true)
    //                .usePca(false)
    //                .build();
    //        vec.lookupTable().plotVocab(tsne);

    log.info("Save vectors....");
    WordVectorSerializer.writeWordVectors(vec, "/Users/Joowon/Desktop/words.txt");
}
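
The vectors above are written in the plain-text word2vec format, so they can be loaded back for later use. A minimal read-back sketch, assuming the same output path and that this Deeplearning4j version provides WordVectorSerializer.readWord2VecModel:

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;

import java.io.File;

public class ReadVectorsSketch {
    public static void main(String[] args) {
        // Load the text-format vectors written by run() above.
        Word2Vec restored = WordVectorSerializer.readWord2VecModel(new File("/Users/Joowon/Desktop/words.txt"));
        System.out.println(restored.wordsNearest("people", 10));
    }
}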

From source file:doc2vec.LuceneDocIterator.java

public void processAll() throws Exception {
    System.out.println("Learning doc embeddings");

    /* Call this to train docvec on a file (each line a sentence)
    String docFileName = prop.getProperty("docvec.in.file");
    learnDocEmbeddings(docFileName);
    */

    // Call this to train doc2vec on the Lucene index:
    String indexPath = prop.getProperty("index");
    learnDocEmbeddings(new File(indexPath));

    String outDocVecFile = prop.getProperty("dvec.out.file");
    BufferedWriter bw = new BufferedWriter(new FileWriter(outDocVecFile));

    System.out.println("Writing out the doc vectors for indexing...");

    WordVectorSerializer.writeWordVectors(vec, bw);

    bw.close();
}
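
The writer in this snippet is closed manually; as a small variant (a sketch reusing the same vec and outDocVecFile from above), try-with-resources closes the stream even if writing fails:

try (BufferedWriter bw = new BufferedWriter(new FileWriter(outDocVecFile))) {
    WordVectorSerializer.writeWordVectors(vec, bw);
}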

From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningChi.java

public static void main(String[] args) throws Exception {
    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator("poem.txt");
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    /*
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally, it forces lower case for all tokens.
     */
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(3).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "poem-vec-2.txt");

    String[] testwords = new String[] { "", "", "", "", "" };
    for (String s : testwords) {
        Collection<String> lst = vec.wordsNearest(s, 5);
        System.out.println(s + " => " + lst);
    }

    Collection<String> kingList = vec.wordsNearest(Arrays.asList("", ""), Arrays.asList(""), 5);
    System.out.println(kingList);
}
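
The literal test words in this example are Chinese terms that are not preserved in this listing. As an illustration of the same wordsNearest(positive, negative, n) analogy call, here is a sketch with English placeholder words, assuming a model trained on an English corpus:

// "king" + "woman" - "man" is expected to rank "queen" near the top
Collection<String> analogy = vec.wordsNearest(Arrays.asList("king", "woman"), Arrays.asList("man"), 5);
System.out.println(analogy);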

From source file:edu.polyu.comp5412.word2vec.Word2VecTrainningEng.java

public static void main(String[] args) throws Exception {

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator("gov-annc.txt");
    iter.setPreProcessor(new SentencePreProcessor() {
        @Override
        public String preProcess(String sentence) {
            return sentence.toLowerCase();
        }
    });

    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    final EndingPreProcessor preProcessor = new EndingPreProcessor();
    t.setTokenPreProcessor(new TokenPreProcess() {
        @Override
        public String preProcess(String token) {
            token = token.toLowerCase();
            String base = preProcessor.preProcess(token);
            base = base.replaceAll("\\d", "d");
            if (base.endsWith("ly") || base.endsWith("ing")) {
                System.out.println();
            }
            return base;
        }
    });
    /*
    CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    Additionally, it forces lower case for all tokens.
     */
    // Note: this replaces the custom EndingPreProcessor-based token preprocessor set above.
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    WordVectorSerializer.writeWordVectors(vec, "gov-annc-vec.txt");

    Collection<String> lst = vec.wordsNearest("information", 10);
    log.info("on 1st run: " + lst);

    System.out.println(vec.similarity("information", "data"));
    System.out.println(vec.similarity("information", "magic"));

}

From source file:uk.bl.wa.nlp.wordvec.WordvecProcessor.java

License:Open Source License

public static void main(String[] args) throws Exception {

    // Use the Stanford NLP sentence splitter:
    SentenceIterator iter = new StanfordSentenceIterator(new FileReader("src/test/resources/Mona_Lisa.txt"));

    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");

    // Write word vectors
    WordVectorSerializer.writeWordVectors(vec, "pathToWriteto.txt");
    WordVectorSerializer.writeFullModel(vec, "pathToWriteto.model");

    log.info("Closest Words:");
    Collection<String> lst = vec.wordsNearest("french", 10);
    System.out.println(lst);
    // UiServer server = UiServer.getInstance();
    // System.out.println("Started on port " + server.getPort());

}
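
writeFullModel is deprecated in more recent Deeplearning4j releases. A sketch of the replacement pair, assuming a release that provides writeWord2VecModel and readWord2VecModel (the .zip file name is illustrative):

// Save the full model (weights, vocab, configuration) in the newer single-file format...
WordVectorSerializer.writeWord2VecModel(vec, new File("pathToWriteto.zip"));
// ...and restore it later for lookup or further training.
Word2Vec restored = WordVectorSerializer.readWord2VecModel(new File("pathToWriteto.zip"));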