com.mapr.synth.distributions.WordGenerator.java Source code

Introduction

Here is the source code for com.mapr.synth.distributions.WordGenerator.java
Source

/*
 * Licensed to the Ted Dunning under one or more contributor license
 * agreements.  See the NOTICE file that may be
 * distributed with this work for additional information
 * regarding copyright ownership.  Ted Dunning licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.mapr.synth.distributions;

import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.LineProcessor;
import com.google.common.io.Resources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * Emulates an infinite list of words, a prefix of which are taken from lists of plausible words.  The first words
 * are taken from a resource that has frequencies in it.  These frequencies can be used to initialize term
 * generators to a common language.  The next batch of words are taken from a long list of words with no frequencies.
 * After that, words are coined by using an integer count.
 */
public class WordGenerator {
    private final Logger log = LoggerFactory.getLogger(WordGenerator.class);

    private BufferedReader wordReader;
    private final List<String> words = Lists.newArrayList();
    private final Map<String, Integer> baseWeights = Maps.newLinkedHashMap();

    public WordGenerator(String seed, String others) {
        // read the common words
        if (seed != null) {
            try {
                Resources.readLines(Resources.getResource(seed), Charsets.UTF_8, new LineProcessor<Object>() {
                    private boolean header = true;
                    private final Splitter onTabs = Splitter.on("\t");

                    public boolean processLine(String s) throws IOException {
                        if (!s.startsWith("#")) {
                            if (!header) {
                                Iterator<String> fields = onTabs.split(s).iterator();
                                fields.next();
                                String word = fields.next();
                                words.add(word);
                                int count = (int) Math.rint(Double.parseDouble(fields.next()));
                                baseWeights.put(word, count);
                            } else {
                                header = false;
                            }
                        }
                        return true;
                    }

                    public Object getResult() {
                        return null;
                    }
                });
            } catch (IOException e) {
                log.error("Can't read resource \"{}\", will continue without realistic words", seed);
            }
        }

        try {
            wordReader = new BufferedReader(
                    Resources.newReaderSupplier(Resources.getResource(others), Charsets.UTF_8).getInput());
        } catch (IOException e) {
            log.error("Can't read resource \"{}\", will continue without realistic words", others);
            wordReader = null;
        }

    }

    public String getString(int n) {
        if (n >= words.size()) {
            synchronized (this) {
                while (n >= words.size()) {
                    try {
                        String w = wordReader.readLine();
                        if (w != null) {
                            words.add(w);
                        } else {
                            words.add("w-" + n);
                        }
                    } catch (IOException e) {
                        log.error("Error reading other words resource", e);
                        words.add("w-" + n);
                    }
                }
            }
        }
        return words.get(n);
    }

    public Map<String, Integer> getBaseWeights() {
        return baseWeights;
    }

    public int size() {
        return words.size();
    }
}