Example usage for org.apache.commons.io LineIterator LineIterator

List of usage examples for org.apache.commons.io LineIterator LineIterator

Introduction

In this page you can find the example usage for org.apache.commons.io LineIterator LineIterator.

Prototype

public LineIterator(final Reader reader) throws IllegalArgumentException 

Source Link

Document

Constructs an iterator of the lines for a Reader.

Usage

From source file:de.tudarmstadt.lt.lm.app.SentPerp.java

/**
 * Reads the given input line by line, splits each line into sentences, maps every sentence
 * to its ngram sequences via the language-model provider, and folds each ngram's log10
 * probability into the instance statistics (_num_ngrams, _oov_ngrams, _oov_terms,
 * _min_prob/_min_ngram, _max_prob/_max_ngram, _num_sents, _sum_log10_prob_sents).
 *
 * Failures on a single line, sentence or ngram are logged and skipped so one malformed
 * input cannot abort the whole run. The reader is not closed by this method.
 *
 * @param r source of input lines
 */
void run(Reader r) {
    long l = 0; // lines consumed so far, used only for progress logging
    for (LineIterator liter = new LineIterator(r); liter.hasNext();) {
        if (++l % 5000 == 0)
            LOG.info("{}: processing line {}.", _rmi_string, l);

        String line = liter.next();
        if (line.trim().isEmpty())
            continue;

        // sentence segmentation is delegated to the (possibly remote) LM provider
        List<String> sentences;
        try {
            sentences = _lm_prvdr.splitSentences(line);
        } catch (Exception e) {
            LOG.error("{}: Could not split sentences from line {}: '{}'.", _rmi_string, l,
                    StringUtils.abbreviate(line, 100), e);
            continue;
        }

        for (String sentence : sentences) {
            double p_log10_sent = 0d; // running sum of ngram log10 probabilities for this sentence
            double num_words = 0d; // how many ngrams actually contributed to the sum
            List<String> tokens;
            List<String>[] ngrams;
            try {
                tokens = _lm_prvdr.tokenizeSentence(sentence);
                if (tokens == null || tokens.isEmpty())
                    continue;
                ngrams = _lm_prvdr.getNgramSequence(tokens);
                if (ngrams == null || ngrams.length == 0)
                    continue;
            } catch (Exception e) {
                LOG.error("{}: Could not get ngrams from line {}: '{}'.", _rmi_string, l,
                        StringUtils.abbreviate(line, 100), e);
                continue;
            }

            for (List<String> ngram : ngrams) {
                if (ngram.isEmpty())
                    continue;
                _num_ngrams++;
                try {

                    // OOV bookkeeping against the reference LM; when _no_oov is set,
                    // ngrams whose last token is OOV are excluded from the sum
                    if (_lm_prvdr_oovref.ngramContainsOOV(ngram)) {
                        _oov_ngrams++;
                        if (_lm_prvdr_oovref.ngramEndsWithOOV(ngram)) {
                            _oov_terms++;
                            if (_no_oov)
                                continue;
                        }
                    }

                    double log10prob = _lm_prvdr.getNgramLog10Probability(ngram);
                    p_log10_sent += log10prob;
                    num_words++;

                    // track the globally least and most probable ngram seen so far
                    if (log10prob < _min_prob) {
                        _min_prob = log10prob;
                        _min_ngram = ngram;
                    }
                    if (log10prob > _max_prob) {
                        _max_prob = log10prob;
                        _max_ngram = ngram;
                    }
                } catch (Exception e) {
                    LOG.error("{}: Could not add ngram '{}' to perplexity.", _rmi_string, ngram);
                    continue;
                }
            }
            if (num_words == 0)
                continue;
            // average log10 probability per contributing ngram (not per token)
            p_log10_sent = p_log10_sent / num_words;
            _num_sents++;
            _sum_log10_prob_sents += p_log10_sent;
        }
    }
}

From source file:de.tudarmstadt.lt.seg.app.Segmenter.java

/**
 * Sequentially reads input line by line (stdin when _filename_in is "-", otherwise the
 * named file), unescapes literal "\t"/"\n" sequences, and pushes each line through
 * split_and_tokenize, writing results to stdout or _filename_out.
 *
 * Fix: the PrintWriter was never flushed or closed, so buffered output could be lost, and
 * a file-based input stream was never closed. Resources are now released in a finally
 * block; System.out itself is only flushed, never closed.
 *
 * @throws Exception if splitter/tokenizer construction or line processing fails
 */
private void run_sequential_line() throws Exception {
    ISentenceSplitter sentenceSplitter = newSentenceSplitter();
    ITokenizer tokenizer = newTokenizer();

    InputStream in = System.in;
    if (!"-".equals(_filename_in))
        in = new FileInputStream(_filename_in);
    LineIterator liter = new LineIterator(
            new BufferedReader(new InputStreamReader(in, Charset.defaultCharset())));

    OutputStream out = System.out;
    if (!"-".equals(_filename_out))
        out = new FileOutputStream(_filename_out);
    PrintWriter w = new PrintWriter(new OutputStreamWriter(out, Charset.defaultCharset()));

    try {
        for (long lc = 0; liter.hasNext();) {
            if (++lc % 1000 == 0)
                System.err.format("Processing line %d ('%s')%n", lc, _filename_in);
            // interpret escaped tab/newline sequences as the real characters
            String l = liter.next().replace("\\t", "\t").replace("\\n", "\n");
            split_and_tokenize(new StringReader(l), String.format("%s:%d", _filename_in, lc), sentenceSplitter,
                    tokenizer, _level_filter, _level_normalize, _merge_types, _merge_tokens, _separator_sentence,
                    _separator_token, _separator_desc, w);
        }
    } finally {
        LineIterator.closeQuietly(liter); // also closes a file input stream, if one was opened
        w.flush();
        if (out != System.out)
            w.close(); // keep System.out usable for the rest of the process
    }
}

From source file:de.tudarmstadt.lt.lm.app.PerplexityClient.java

/**
 * Reads the given input line by line, extracts ngrams for each line (the whole line as a
 * single space-split ngram when _one_ngram_per_line is set, otherwise via the LM
 * provider), and adds each ngram's log10 probability to the global and per-file
 * perplexity accumulators, keeping OOV statistics for both the main LM and the
 * OOV-reference LM.
 *
 * Failures on a single line or ngram are logged and skipped. The reader is not closed.
 *
 * @param r source of input lines
 */
@SuppressWarnings("unchecked")
void run(Reader r) {
    long l = 0; // lines consumed so far, used only for progress logging
    for (LineIterator liter = new LineIterator(r); liter.hasNext();) {
        if (++l % 5000 == 0)
            LOG.info("{}: processing line {}.", _rmi_string, l);

        String line = liter.next();
        if (line.trim().isEmpty())
            continue;

        List<String>[] ngrams;
        try {
            if (_one_ngram_per_line)
                ngrams = new List[] { Arrays.asList(line.split(" ")) };
            else
                ngrams = _lm_prvdr.getNgrams(line);
            if (ngrams == null || ngrams.length == 0)
                continue;
        } catch (Exception e) {
            LOG.error("{}: Could not get ngrams from line {}: '{}'.", _rmi_string, l,
                    StringUtils.abbreviate(line, 100), e);
            continue;
        }

        for (List<String> ngram : ngrams) {
            if (ngram.isEmpty())
                continue;
            _num_ngrams++;
            try {
                // OOV bookkeeping for the main LM
                boolean oov = false;
                if (_lm_prvdr.ngramContainsOOV(ngram)) {
                    _oov_ngrams++;
                    if (_lm_prvdr.ngramEndsWithOOV(ngram)) {
                        _oov_terms++;
                        oov = true;
                    }
                }

                // OOV bookkeeping for the reference LM; depending on the _no_oov* flags,
                // such ngrams may be excluded from the perplexity computation
                if (_lm_prvdr_oovref.ngramContainsOOV(ngram)) {
                    _oovreflm_oov_ngrams++;
                    if (_lm_prvdr_oovref.ngramEndsWithOOV(ngram)) {
                        _oovreflm_oov_terms++;
                        if (_no_oov_reflm || (_no_oov && oov))
                            continue;
                    }
                }

                double log10prob = _perplexity_all.addLog10Prob(ngram);
                _perplexity_file.addLog10Prob(ngram);
                // track the globally least and most probable ngram seen so far
                if (log10prob < _min_prob) {
                    _min_prob = log10prob;
                    _min_ngram = ngram;
                }
                if (log10prob > _max_prob) {
                    _max_prob = log10prob;
                    _max_ngram = ngram;
                }
            } catch (Exception e) {
                LOG.error("{}: Could not add ngram '{}' to perplexity.", _rmi_string, ngram);
                continue;
            }
        }
    }
}

From source file:com.github.anba.es6draft.test262.Test262Info.java

/**
 * Parses the tagged metadata descriptor of a test262 test file line by line, applying
 * each recognized "tag: value" pair to this info object. Unknown tags throw a
 * MalformedDataException unless lenient parsing was requested.
 *
 * @param descriptor the non-empty descriptor text to parse
 * @param lenient whether unrecognized tags are silently ignored
 * @throws MalformedDataException on an unhandled tag when not lenient
 */
private void readTagged(String descriptor, boolean lenient) throws MalformedDataException {
    assert descriptor != null && !descriptor.isEmpty();
    for (LineIterator lines = new LineIterator(new StringReader(descriptor)); lines.hasNext();) {
        Matcher m = tags.matcher(lines.next());
        if (!m.matches()) {
            continue;
        }
        String type = m.group(1);
        String val = m.group(2);
        if ("description".equals(type)) {
            this.description = requireNonNull(val, "description must not be null");
        } else if ("noStrict".equals(type)) {
            requireNull(val);
            this.noStrict = true;
        } else if ("onlyStrict".equals(type)) {
            requireNull(val);
            this.onlyStrict = true;
        } else if ("negative".equals(type)) {
            this.negative = true;
            this.errorType = Objects.toString(val, this.errorType);
        } else if ("hostObject".equals(type) || "reviewers".equals(type) || "generator".equals(type)
                || "verbatim".equals(type) || "noHelpers".equals(type) || "bestPractice".equals(type)
                || "implDependent".equals(type) || "author".equals(type)) {
            // recognized metadata tags that are currently ignored
        } else if ("strict_mode_negative".equals(type)) {
            // legacy spelling of negative + onlyStrict
            this.negative = true;
            this.onlyStrict = true;
            this.errorType = Objects.toString(val, this.errorType);
        } else if ("strict_only".equals(type)) {
            // legacy spelling of onlyStrict
            requireNull(val);
            this.onlyStrict = true;
        } else if ("errortype".equals(type)) {
            // legacy spelling of the expected error type
            this.errorType = requireNonNull(val, "error-type must not be null");
        } else if ("assertion".equals(type) || "section".equals(type) || "path".equals(type)
                || "comment".equals(type) || "name".equals(type)) {
            // legacy tags that are currently ignored
        } else if (!lenient) {
            // unhandled tag: an error unless lenient parsing was requested
            throw new MalformedDataException(String.format("unhandled type '%s' (%s)\n", type, this));
        }
    }
}

From source file:dk.netarkivet.harvester.datamodel.PartialHarvest.java

/**
 * File-based counterpart of the addSeeds method: reads newline-separated seeds from the
 * given file, validates each of them, and adds the accepted ones to this harvest under
 * the given template and per-domain limits.
 *
 * @param seedsFile a newline-separated File containing the seeds to be added
 * @param templateName the name of the template to be used
 * @param maxBytes Maximum number of bytes to harvest per domain
 * @param maxObjects Maximum number of objects to harvest per domain
 * @param attributeValues attribute values forwarded to the created configurations
 */
public void addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects,
        Map<String, String> attributeValues) {
    ArgumentNotValid.checkNotNull(seedsFile, "seeds");
    ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist");
    ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName");
    if (!TemplateDAO.getInstance().exists(templateName)) {
        throw new UnknownID("No such template: " + templateName);
    }

    StringBuilder invalidMessage = new StringBuilder(
            "Unable to create an event harvest.\n" + "The following seeds are invalid:\n");
    Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>();
    boolean valid = true;

    // Validate every seed in the file: accepted seeds are collected in acceptedSeeds,
    // rejected ones are appended to invalidMessage and flip the valid flag.
    LineIterator seedIterator = null;
    try {
        seedIterator = new LineIterator(new FileReader(seedsFile));
        while (seedIterator.hasNext()) {
            valid &= processSeed(seedIterator.next(), invalidMessage, acceptedSeeds);
        }
    } catch (IOException e) {
        throw new IOFailure("Unable to process seedsfile ", e);
    } finally {
        LineIterator.closeQuietly(seedIterator);
    }

    if (!valid) {
        throw new ArgumentNotValid(invalidMessage.toString());
    }

    addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues);
}

From source file:edu.cornell.med.icb.goby.modes.EmpiricalPMode.java

/**
 * Counts the number of lines in the given file.
 *
 * Fix: the iterator (and its underlying FileReader) was only closed on the success path
 * and leaked whenever iteration threw; it is now released in a finally block. The unused
 * per-line Object local was dropped.
 *
 * @param inputFilename path of the file whose lines are counted
 * @return the number of lines in the file
 * @throws FileNotFoundException if the file does not exist or cannot be opened
 */
private int countLines(String inputFilename) throws FileNotFoundException {
    int lineCount = 0;
    LineIterator it = new LineIterator(new FileReader(inputFilename));
    try {
        while (it.hasNext()) {
            it.next();
            lineCount++;
        }
    } finally {
        LineIterator.closeQuietly(it);
    }
    return lineCount;
}

From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java

/**
 * Creates the Lucene "ngram" index from a (possibly gzip-compressed) file of joined ngram
 * counts. Each input line has the tab-separated form
 * "ngram \t count [\t type:v0,v1,... ...]" where up to three optional trailing columns
 * carry follower ("n_f"), precede ("n_p") and follower-precede ("n_fp") count values.
 * Besides the per-ngram documents, aggregated counts-of-counts N and total counts S per
 * ngram cardinality are written to the "__sum_ngrams__" summary file.
 *
 * Improvements: the three byte-identical parsing blocks for the optional count columns
 * are collapsed into the setCountValues helper, and the line iterator (and thereby the
 * underlying file/gzip stream) is now closed in a finally block.
 *
 * @param ngram_joined_counts_file input file with joined ngram counts
 * @throws IOException if the index or the summary file cannot be written
 */
public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
    File index_dir = new File(_index_dir, "ngram");
    if (index_dir.exists()) {
        LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();

    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    // size the ram buffer as a configurable fraction of the maximum heap
    double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
    double percentage_ram_buffer = Properties.ramBufferPercentage();
    if (percentage_ram_buffer > 0) {
        double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
        LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
        iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
    }

    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_ngram = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(ngram_joined_counts_file);
    if (ngram_joined_counts_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    // one reusable Document; its field values are overwritten for every input line
    Document doc = new Document();
    Field f_ngram = new StringField("ngram", "", Store.YES);
    doc.add(f_ngram);
    Field f_n = new IntField("cardinality", 0, Store.YES);
    doc.add(f_n);
    Field f_word = new StringField("word", "", Store.YES);
    doc.add(f_word);
    Field f_hist = new StringField("history", "", Store.YES);
    doc.add(f_hist);
    Field f_lower = new StringField("lower", "", Store.YES);
    doc.add(f_lower);
    Field f_count = new StoredField("num", 0L);
    doc.add(f_count);

    Field[] f_follow = new Field[4];
    f_follow[0] = new StoredField("nf_s", 0L);
    doc.add(f_follow[0]);
    f_follow[1] = new StoredField("nf_N1", 0L);
    doc.add(f_follow[1]);
    f_follow[2] = new StoredField("nf_N2", 0L);
    doc.add(f_follow[2]);
    f_follow[3] = new StoredField("nf_N3", 0L);
    doc.add(f_follow[3]);
    Field[] f_precede = new Field[4];
    f_precede[0] = new StoredField("np_s", 0L);
    doc.add(f_precede[0]);
    f_precede[1] = new StoredField("np_N1", 0L);
    doc.add(f_precede[1]);
    f_precede[2] = new StoredField("np_N2", 0L);
    doc.add(f_precede[2]);
    f_precede[3] = new StoredField("np_N3", 0L);
    doc.add(f_precede[3]);
    Field[] f_followerprecede = new Field[4];
    f_followerprecede[0] = new StoredField("nfp_s", 0L);
    doc.add(f_followerprecede[0]);
    f_followerprecede[1] = new StoredField("nfp_N1", 0L);
    doc.add(f_followerprecede[1]);
    f_followerprecede[2] = new StoredField("nfp_N2", 0L);
    doc.add(f_followerprecede[2]);
    f_followerprecede[3] = new StoredField("nfp_N3", 0L);
    doc.add(f_followerprecede[3]);

    // N[n][0] = #distinct ngrams of size n; N[n][1..4] = #ngrams seen exactly 1..4 times;
    // N[n][5] = #ngrams seen five or more times; S[n] = total count of all ngrams of size n
    Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
    Long[] S = new Long[] { 0L };
    long c = 0;
    try {
        while (iter.hasNext()) {
            if (++c % 100000 == 0)
                LOG.info("Adding {}'th ngram.", c);
            String line = iter.next();
            try {
                String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
                String ngram_str = splits[0];
                if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                    LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line,
                            ngram_joined_counts_file);
                    continue;
                }

                List<String> ngram = Arrays.asList(ngram_str.split(" "));
                long num = Long.parseLong(splits[1]);
                int n = ngram.size();

                f_ngram.setStringValue(ngram_str);
                f_n.setIntValue(n);
                f_word.setStringValue(ngram.get(ngram.size() - 1));
                f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
                f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
                f_count.setLongValue(num);

                for (int j = 0; j < f_follow.length; j++) {
                    f_follow[j].setLongValue(0L);
                    f_precede[j].setLongValue(0L);
                    f_followerprecede[j].setLongValue(0L);
                }

                // optional columns 3..5 each carry one "type:count,count,..." specification
                if (splits.length > 2 && !splits[2].isEmpty())
                    setCountValues(splits[2], f_follow, f_precede, f_followerprecede);
                if (splits.length > 3 && !splits[3].isEmpty())
                    setCountValues(splits[3], f_follow, f_precede, f_followerprecede);
                if (splits.length > 4 && !splits[4].isEmpty())
                    setCountValues(splits[4], f_follow, f_precede, f_followerprecede);

                writer_ngram.addDocument(doc);

                // grow the per-cardinality statistics on demand
                while (N.length <= n) {
                    N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                    S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
                }

                if (num == 1L)
                    N[n][1]++;
                else if (num == 2L)
                    N[n][2]++;
                else if (num == 3L)
                    N[n][3]++;
                else if (num == 4L)
                    N[n][4]++;
                else
                    N[n][5]++;
                N[n][0]++;
                S[n] += num;

            } catch (Exception e) {
                LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line,
                        ngram_joined_counts_file, c, e);
            }
        }
    } finally {
        LineIterator.closeQuietly(iter); // also closes the underlying (gzip) file stream
    }

    writer_ngram.forceMergeDeletes();
    writer_ngram.commit();
    writer_ngram.close();

    StringBuilder b = new StringBuilder(String.format(
            "#%n# Number of times where an ngram occurred: %n#  at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
            N.length - 1));
    for (int n = 1; n < N.length; n++)
        b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
    for (int n = 1; n < S.length; n++)
        b.append(String.format("s%d=%d%n", n, S[n]));
    FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());

}

/**
 * Parses a count column of the form "type:v0,v1,..." and stores the values into the field
 * array selected by the type prefix ("n_f" = follower, "n_p" = precede, "n_fp" =
 * follower-precede); unknown types are ignored. Behaviorally equivalent to the three
 * copy-pasted parsing blocks it replaces, including letting malformed specifications
 * propagate to the caller's per-line catch-all.
 */
private static void setCountValues(String countSpec, Field[] f_follow, Field[] f_precede,
        Field[] f_followerprecede) {
    String[] splits_ = countSpec.split(":");
    String type = splits_[0];
    String[] count_values = splits_[1].split(",");
    Field[] target;
    if ("n_f".equals(type))
        target = f_follow;
    else if ("n_p".equals(type))
        target = f_precede;
    else if ("n_fp".equals(type))
        target = f_followerprecede;
    else
        return;
    for (int i = 0; i < count_values.length; i++)
        target[i].setLongValue(Long.parseLong(count_values[i]));
}

From source file:ComLog.IOUtils.java

/**
 * Returns an Iterator over the lines in a <code>Reader</code>.
 * <p>
 * <code>LineIterator</code> holds a reference to the open
 * <code>Reader</code> specified here. When you have finished with the
 * iterator you should close the reader to free internal resources.
 * This can be done by closing the reader directly, or by calling
 * {@link LineIterator#close()} or {@link LineIterator#closeQuietly(LineIterator)}.
 * <p>
 * The recommended usage pattern is:
 * <pre>
 * try {
 *   LineIterator it = IOUtils.lineIterator(reader);
 *   while (it.hasNext()) {
 *     String line = it.nextLine();
 *     /// do something with line
 *   }
 * } finally {
 *   IOUtils.closeQuietly(reader);
 * }
 * </pre>
 *
 * @param reader  the <code>Reader</code> to read from, not null
 * @return an Iterator of the lines in the reader, never null
 * @throws IllegalArgumentException if the reader is null
 * @since Commons IO 1.2
 */
public static LineIterator lineIterator(Reader reader) {
    return new LineIterator(reader);
}

From source file:ComLog.IOUtils.java

/**
 * Return an Iterator for the lines in an <code>InputStream</code>, using
 * the character encoding specified (or default encoding if null).
 * <p>/*from   ww w.  j  av a  2 s.  c om*/
 * <code>LineIterator</code> holds a reference to the open
 * <code>InputStream</code> specified here. When you have finished with
 * the iterator you should close the stream to free internal resources.
 * This can be done by closing the stream directly, or by calling
 * {@link LineIterator#close()} or {@link LineIterator#closeQuietly(LineIterator)}.
 * <p>
 * The recommended usage pattern is:
 * <pre>
 * try {
 *   LineIterator it = IOUtils.lineIterator(stream, "UTF-8");
 *   while (it.hasNext()) {
 *     String line = it.nextLine();
 *     /// do something with line
 *   }
 * } finally {
 *   IOUtils.closeQuietly(stream);
 * }
 * </pre>
 *
 * @param input  the <code>InputStream</code> to read from, not null
 * @param encoding  the encoding to use, null means platform default
 * @return an Iterator of the lines in the reader, never null
 * @throws IllegalArgumentException if the input is null
 * @throws IOException if an I/O error occurs, such as if the encoding is invalid
 * @since Commons IO 1.2
 */
public static LineIterator lineIterator(InputStream input, String encoding) throws IOException {
    Reader reader = null;
    if (encoding == null) {
        reader = new InputStreamReader(input);
    } else {
        reader = new InputStreamReader(input, encoding);
    }
    return new LineIterator(reader);
}

From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java

/**
 * Creates the Lucene "vocab" index from a (possibly gzip-compressed) vocabulary file with
 * one word per line. An existing index is only replaced when _overwrite is set.
 *
 * Fix: the line iterator (and thus the underlying file/gzip stream) was never closed; it
 * is now released in a finally block.
 *
 * @param vocabulary_file input file with one vocabulary entry per line
 * @throws IOException if the index cannot be written
 */
public void create_vocabulary_index(File vocabulary_file) throws IOException {
    File index_dir = new File(_index_dir, "vocab");
    if (index_dir.exists()) {
        LOG.info("Vocabulary index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(1024.0);
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_vocab = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(vocabulary_file);
    if (vocabulary_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));
    // one reusable Document; the word field is overwritten for every input line
    Document doc = new Document();
    Field f_word = new StringField("word", "", Field.Store.YES);
    doc.add(f_word);
    long c = 0;
    try {
        while (iter.hasNext()) {
            if (++c % 10000 == 0)
                LOG.info("Adding {}'th word.", c);
            String line = iter.next();
            try {
                String word = line.trim();
                f_word.setStringValue(word);
                writer_vocab.addDocument(doc);
            } catch (Exception e) {
                LOG.warn("Could not process line '{}' in file '{}', malformed line.", line, vocabulary_file, e);
            }
        }
    } finally {
        LineIterator.closeQuietly(iter); // also closes the underlying (gzip) file stream
    }

    writer_vocab.forceMergeDeletes();
    writer_vocab.commit();
    writer_vocab.close();
}