List of usage examples for the org.apache.commons.io.LineIterator constructor:
public LineIterator(final Reader reader) throws IllegalArgumentException
Creates an iterator over the lines of the given Reader.
. From source file:de.tudarmstadt.lt.lm.app.SentPerp.java
// Streams the reader line by line and accumulates per-sentence perplexity statistics:
// each non-empty line is split into sentences via _lm_prvdr, tokenized, turned into
// ngram sequences, and the log10 probability of every ngram is summed; the per-sentence
// average is accumulated into _sum_log10_prob_sents. Also tracks min/max ngram
// log-probabilities and OOV counts against _lm_prvdr_oovref; ngrams ending in an OOV
// term are skipped when _no_oov is set. Errors on individual lines/ngrams are logged
// and skipped (best-effort processing).
// NOTE(review): the LineIterator is never closed here — presumably the caller owns and
// closes 'r'; confirm against call sites.
void run(Reader r) { long l = 0; for (LineIterator liter = new LineIterator(r); liter.hasNext();) { if (++l % 5000 == 0) LOG.info("{}: processing line {}.", _rmi_string, l); String line = liter.next(); if (line.trim().isEmpty()) continue; List<String> sentences; try { sentences = _lm_prvdr.splitSentences(line); } catch (Exception e) { LOG.error("{}: Could not split sentences from line {}: '{}'.", _rmi_string, l, StringUtils.abbreviate(line, 100), e); continue; } for (String sentence : sentences) { double p_log10_sent = 0d; double num_words = 0d; List<String> tokens; List<String>[] ngrams; try { tokens = _lm_prvdr.tokenizeSentence(sentence); if (tokens == null || tokens.isEmpty()) continue; ngrams = _lm_prvdr.getNgramSequence(tokens); if (ngrams == null || ngrams.length == 0) continue; } catch (Exception e) { LOG.error("{}: Could not get ngrams from line {}: '{}'.", _rmi_string, l, StringUtils.abbreviate(line, 100), e); continue; } for (List<String> ngram : ngrams) { if (ngram.isEmpty()) continue; _num_ngrams++; try { if (_lm_prvdr_oovref.ngramContainsOOV(ngram)) { _oov_ngrams++; if (_lm_prvdr_oovref.ngramEndsWithOOV(ngram)) { _oov_terms++; if (_no_oov) continue; } } double log10prob = _lm_prvdr.getNgramLog10Probability(ngram); p_log10_sent += log10prob; num_words++; if (log10prob < _min_prob) { _min_prob = log10prob; _min_ngram = ngram; } if (log10prob > _max_prob) { _max_prob = log10prob; _max_ngram = ngram; } } catch (Exception e) { LOG.error("{}: Could not add ngram '{}' to perplexity.", _rmi_string, ngram); continue; } } if (num_words == 0) continue; p_log10_sent = p_log10_sent / num_words; /* (double)tokens.size() */ _num_sents++; _sum_log10_prob_sents += p_log10_sent; } } }
From source file:de.tudarmstadt.lt.seg.app.Segmenter.java
private void run_sequential_line() throws Exception { ISentenceSplitter sentenceSplitter = newSentenceSplitter(); ITokenizer tokenizer = newTokenizer(); InputStream in = System.in; if (!"-".equals(_filename_in)) in = new FileInputStream(_filename_in); LineIterator liter = new LineIterator( new BufferedReader(new InputStreamReader(in, Charset.defaultCharset()))); OutputStream out = System.out; if (!"-".equals(_filename_out)) out = new FileOutputStream(_filename_out); PrintWriter w = new PrintWriter(new OutputStreamWriter(out, Charset.defaultCharset())); for (long lc = 0; liter.hasNext();) { if (++lc % 1000 == 0) System.err.format("Processing line %d ('%s')%n", lc, _filename_in); String l = liter.next().replace("\\t", "\t").replace("\\n", "\n"); split_and_tokenize(new StringReader(l), String.format("%s:%d", _filename_in, lc), sentenceSplitter, tokenizer, _level_filter, _level_normalize, _merge_types, _merge_tokens, _separator_sentence, _separator_token, _separator_desc, w); }//from w w w. j av a 2s .c o m }
From source file:de.tudarmstadt.lt.lm.app.PerplexityClient.java
// Streams the reader line by line and feeds every ngram into the perplexity
// accumulators (_perplexity_all / _perplexity_file). When _one_ngram_per_line is
// set, each whole line is treated as a single space-separated ngram; otherwise
// ngrams come from _lm_prvdr.getNgrams(line). Counts OOV ngrams/terms for both
// the main provider and the OOV reference provider, skipping OOV-final ngrams
// according to _no_oov / _no_oov_reflm, and tracks min/max log10 probabilities.
// Errors on individual lines/ngrams are logged and skipped (best-effort).
// NOTE(review): in the scraped original a '//' watermark comment swallowed the
// entire method body after 'long l = 0;' — removed so the code parses.
// NOTE(review): the LineIterator is never closed; presumably the caller owns
// and closes 'r' — confirm against call sites.
@SuppressWarnings("unchecked") void run(Reader r) { long l = 0; for (LineIterator liter = new LineIterator(r); liter.hasNext();) { if (++l % 5000 == 0) LOG.info("{}: processing line {}.", _rmi_string, l); String line = liter.next(); if (line.trim().isEmpty()) continue; List<String>[] ngrams; try { if (_one_ngram_per_line) ngrams = new List[] { Arrays.asList(line.split(" ")) }; else ngrams = _lm_prvdr.getNgrams(line); if (ngrams == null || ngrams.length == 0) continue; } catch (Exception e) { LOG.error("{}: Could not get ngrams from line {}: '{}'.", _rmi_string, l, StringUtils.abbreviate(line, 100), e); continue; } for (List<String> ngram : ngrams) { if (ngram.isEmpty()) continue; _num_ngrams++; try { boolean oov = false; if (_lm_prvdr.ngramContainsOOV(ngram)) { _oov_ngrams++; if (_lm_prvdr.ngramEndsWithOOV(ngram)) { _oov_terms++; oov = true; } } if (_lm_prvdr_oovref.ngramContainsOOV(ngram)) { _oovreflm_oov_ngrams++; if (_lm_prvdr_oovref.ngramEndsWithOOV(ngram)) { _oovreflm_oov_terms++; if (_no_oov_reflm || (_no_oov && oov)) continue; } } double log10prob = _perplexity_all.addLog10Prob(ngram); _perplexity_file.addLog10Prob(ngram); if (log10prob < _min_prob) { _min_prob = log10prob; _min_ngram = ngram; } if (log10prob > _max_prob) { _max_prob = log10prob; _max_ngram = ngram; } } catch (Exception e) { LOG.error("{}: Could not add ngram '{}' to perplexity.", _rmi_string, ngram); continue; } } } }
From source file:com.github.anba.es6draft.test262.Test262Info.java
// Parses a test262 descriptor in the legacy tagged front-matter format: each
// line matching the 'tags' pattern yields a (type, value) pair that sets the
// corresponding test property — description, noStrict/onlyStrict flags,
// negative + errorType, etc. Several informational tags are recognized and
// ignored. Unknown tags raise MalformedDataException unless 'lenient' is set.
// NOTE(review): the scrape mashed '//' line comments into this single line,
// swallowing the code after them ('break;', the legacy cases, the throw);
// restored as block comments so the code parses — comment-only repair.
private void readTagged(String descriptor, boolean lenient) throws MalformedDataException { assert descriptor != null && !descriptor.isEmpty(); for (LineIterator lines = new LineIterator(new StringReader(descriptor)); lines.hasNext();) { String line = lines.next(); Matcher m = tags.matcher(line); if (m.matches()) { String type = m.group(1); String val = m.group(2); switch (type) { case "description": this.description = requireNonNull(val, "description must not be null"); break; case "noStrict": requireNull(val); this.noStrict = true; break; case "onlyStrict": requireNull(val); this.onlyStrict = true; break; case "negative": this.negative = true; this.errorType = Objects.toString(val, this.errorType); break; case "hostObject": case "reviewers": case "generator": case "verbatim": case "noHelpers": case "bestPractice": case "implDependent": case "author": /* ignore for now */ break; /* legacy */ case "strict_mode_negative": this.negative = true; this.onlyStrict = true; this.errorType = Objects.toString(val, this.errorType); break; case "strict_only": requireNull(val); this.onlyStrict = true; break; case "errortype": this.errorType = requireNonNull(val, "error-type must not be null"); break; case "assertion": case "section": case "path": case "comment": case "name": /* ignore for now */ break; default: /* error */ if (lenient) { break; } throw new MalformedDataException(String.format("unhandled type '%s' (%s)\n", type, this)); } } } }
From source file:dk.netarkivet.harvester.datamodel.PartialHarvest.java
/**
 * This method is a duplicate of the addSeeds method, but takes a seedsFile
 * parameter instead of a seed string. Each line of the file is validated via
 * processSeed; if any seed is invalid, no seeds are added and ArgumentNotValid
 * is thrown with the collected error messages, otherwise all accepted seeds are
 * handed to addSeedsToDomain.
 *
 * @param seedsFile a newline-separated File containing the seeds to be added
 * @param templateName the name of the template to be used
 * @param maxBytes Maximum number of bytes to harvest per domain
 * @param maxObjects Maximum number of objects to harvest per domain
 * @param attributeValues additional attribute values, passed through to addSeedsToDomain
 */
public void addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects, Map<String, String> attributeValues) { ArgumentNotValid.checkNotNull(seedsFile, "seeds"); ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist"); ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); if (!TemplateDAO.getInstance().exists(templateName)) { throw new UnknownID("No such template: " + templateName); } Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); StringBuilder invalidMessage = new StringBuilder( "Unable to create an event harvest.\n" + "The following seeds are invalid:\n"); boolean valid = true; /* validate all the seeds in the file; those accepted are entered into the acceptedSeeds datastructure. Iterate through the contents of the file */ LineIterator seedIterator = null; try { seedIterator = new LineIterator(new FileReader(seedsFile)); /* NOTE(review): FileReader decodes with the platform default charset — seeds containing non-ASCII characters may be mis-read; consider an explicit UTF-8 reader. TODO confirm expected seed-file encoding */ while (seedIterator.hasNext()) { String seed = seedIterator.next(); boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds); if (!seedValid) { valid = false; } } } catch (IOException e) { throw new IOFailure("Unable to process seedsfile ", e); } finally { LineIterator.closeQuietly(seedIterator); } if (!valid) { throw new ArgumentNotValid(invalidMessage.toString()); } addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues); }
From source file:edu.cornell.med.icb.goby.modes.EmpiricalPMode.java
/**
 * Counts the number of lines in the given file.
 *
 * Fix: the underlying FileReader is now released in a finally block — previously
 * {@code it.close()} was only reached on success, leaking the reader whenever
 * iteration threw. Also drops the unused {@code next} local.
 *
 * @param inputFilename path of the file whose lines are counted
 * @return the number of lines in the file
 * @throws FileNotFoundException if the file cannot be opened
 */
private int countLines(String inputFilename) throws FileNotFoundException {
    int lineCount = 0;
    LineIterator it = new LineIterator(new FileReader(inputFilename));
    try {
        while (it.hasNext()) {
            it.next(); // value intentionally unused; we only count lines
            lineCount++;
        }
    } finally {
        LineIterator.closeQuietly(it); // closes the underlying reader even on failure
    }
    return lineCount;
}
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
// Builds a Lucene keyword index over a joined ngram-count file (tab-separated:
// ngram, count, then optional "n_f:"/"n_p:"/"n_fp:" comma-separated follower/
// preceder/follower-preceder count fields). One document per ngram with fields
// ngram/cardinality/word/history/lower/num plus the nf_*/np_*/nfp_* stored
// counts. Simultaneously accumulates count-of-count statistics N[n][...] and
// sums S[n], written to the "__sum_ngrams__" summary file at the end.
// NOTE(review): index_dir.delete() silently fails on a non-empty directory
// (File.delete() is non-recursive), so _overwrite may not actually clear an
// old index — confirm; a recursive delete (e.g. FileUtils.deleteDirectory)
// looks intended.
// NOTE(review): 'iter' / the input stream are never closed — resource leak.
// NOTE(review): scraper-mangled '//' comments that swallowed following code
// have been restored as block comments so the code parses — comment-only repair.
public void create_ngram_index(File ngram_joined_counts_file) throws IOException { File index_dir = new File(_index_dir, "ngram"); if (index_dir.exists()) { LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath()); if (_overwrite) { LOG.info("Overwriting index '{}',", index_dir); index_dir.delete(); } else return; } index_dir.mkdirs(); Analyzer analyzer = new KeywordAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer); iwc.setOpenMode(OpenMode.CREATE); /* use 80 percent of the available total memory */ double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6; double percentage_ram_buffer = Properties.ramBufferPercentage(); if (percentage_ram_buffer > 0) { double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer; LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)", percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb)); iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb); } Directory directory = new MMapDirectory(index_dir); IndexWriter writer_ngram = new IndexWriter(directory, iwc); InputStream in = new FileInputStream(ngram_joined_counts_file); if (ngram_joined_counts_file.getName().endsWith(".gz")) in = new GZIPInputStream(in); LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8"))); Document doc = new Document(); Field f_ngram = new StringField("ngram", "", Store.YES); doc.add(f_ngram); Field f_n = new IntField("cardinality", 0, Store.YES); doc.add(f_n); Field f_word = new StringField("word", "", Store.YES); doc.add(f_word); Field f_hist = new StringField("history", "", Store.YES); doc.add(f_hist); Field f_lower = new StringField("lower", "", Store.YES); doc.add(f_lower); Field f_count = new StoredField("num", 0L); doc.add(f_count); Field[] f_follow = new Field[4]; f_follow[0] = new StoredField("nf_s", 0L); doc.add(f_follow[0]); f_follow[1] = new StoredField("nf_N1", 0L); 
doc.add(f_follow[1]); f_follow[2] = new StoredField("nf_N2", 0L); doc.add(f_follow[2]); f_follow[3] = new StoredField("nf_N3", 0L); doc.add(f_follow[3]); Field[] f_precede = new Field[4]; f_precede[0] = new StoredField("np_s", 0L); doc.add(f_precede[0]); f_precede[1] = new StoredField("np_N1", 0L); doc.add(f_precede[1]); f_precede[2] = new StoredField("np_N2", 0L); doc.add(f_precede[2]); f_precede[3] = new StoredField("np_N3", 0L); doc.add(f_precede[3]); Field[] f_followerprecede = new Field[4]; f_followerprecede[0] = new StoredField("nfp_s", 0L); doc.add(f_followerprecede[0]); f_followerprecede[1] = new StoredField("nfp_N1", 0L); doc.add(f_followerprecede[1]); f_followerprecede[2] = new StoredField("nfp_N2", 0L); doc.add(f_followerprecede[2]); f_followerprecede[3] = new StoredField("nfp_N3", 0L); doc.add(f_followerprecede[3]); Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } }; Long[] S = new Long[] { 0L }; long c = 0; while (iter.hasNext()) { if (++c % 100000 == 0) LOG.info("Adding {}'th ngram.", c); String line = iter.next(); try { String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t"); String ngram_str = splits[0]; if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) { LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line, ngram_joined_counts_file); continue; } List<String> ngram = Arrays.asList(ngram_str.split(" ")); long num = Long.parseLong(splits[1]); int n = ngram.size(); f_ngram.setStringValue(ngram_str); f_n.setIntValue(n); f_word.setStringValue(ngram.get(ngram.size() - 1)); f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " ")); f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " ")); f_count.setLongValue(num); for (int j = 0; j < f_follow.length; j++) { f_follow[j].setLongValue(0L); f_precede[j].setLongValue(0L); f_followerprecede[j].setLongValue(0L); } if (splits.length > 2 && !splits[2].isEmpty()) { /* precede or follow or 
followerprecede */ String[] splits_ = splits[2].split(":"); String type = splits_[0]; String[] count_values = splits_[1].split(","); if (count_values.length > 0) { if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0])); else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0])); else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0])); } for (int i = 1; i < count_values.length; i++) { if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i])); else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i])); else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i])); } } if (splits.length > 3 && !splits[3].isEmpty()) { /* should be follow or followerprecede */ String[] splits_ = splits[3].split(":"); String type = splits_[0]; String[] count_values = splits_[1].split(","); if (count_values.length > 0) { if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0])); else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0])); else if ("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0])); } for (int i = 1; i < count_values.length; i++) { if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i])); else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i])); else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i])); } } if (splits.length > 4 && !splits[4].isEmpty()) { /* should be followerprecede */ String[] splits_ = splits[4].split(":"); String type = splits_[0]; String[] count_values = splits_[1].split(","); if (count_values.length > 0) { if ("n_f".equals(type)) f_follow[0].setLongValue(Long.parseLong(count_values[0])); else if ("n_p".equals(type)) f_precede[0].setLongValue(Long.parseLong(count_values[0])); else if 
("n_fp".equals(type)) f_followerprecede[0].setLongValue(Long.parseLong(count_values[0])); } for (int i = 1; i < count_values.length; i++) { if ("n_f".equals(type)) f_follow[i].setLongValue(Long.parseLong(count_values[i])); else if ("n_p".equals(type)) f_precede[i].setLongValue(Long.parseLong(count_values[i])); else if ("n_fp".equals(type)) f_followerprecede[i].setLongValue(Long.parseLong(count_values[i])); } } writer_ngram.addDocument(doc); while (N.length <= n) { N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } }); S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L }); } if (num == 1L) N[n][1]++; else if (num == 2L) N[n][2]++; else if (num == 3L) N[n][3]++; else if (num == 4L) N[n][4]++; else N[n][5]++; N[n][0]++; S[n] += num; } catch (Exception e) { LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line, ngram_joined_counts_file, c, e); } } writer_ngram.forceMergeDeletes(); writer_ngram.commit(); writer_ngram.close(); StringBuilder b = new StringBuilder(String.format( "#%n# Number of times where an ngram occurred: %n# at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n", N.length - 1)); for (int n = 1; n < N.length; n++) b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ','))); for (int n = 1; n < S.length; n++) b.append(String.format("s%d=%d%n", n, S[n])); FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString()); }
From source file:ComLog.IOUtils.java
/**
 * Returns an Iterator over the lines of the given {@code Reader}.
 * <p>
 * The returned {@code LineIterator} holds a reference to the open reader; when
 * iteration is finished, free the underlying resources by closing the reader
 * itself, or by calling {@link LineIterator#close()} or
 * {@link LineIterator#closeQuietly(LineIterator)}.
 * <p>
 * Recommended usage:
 * <pre>
 * try {
 *     LineIterator it = IOUtils.lineIterator(reader);
 *     while (it.hasNext()) {
 *         String line = it.nextLine();
 *         /// do something with line
 *     }
 * } finally {
 *     IOUtils.closeQuietly(reader);
 * }
 * </pre>
 *
 * @param reader the <code>Reader</code> to read from, not null
 * @return an Iterator of the lines in the reader, never null
 * @throws IllegalArgumentException if the reader is null
 * @since Commons IO 1.2
 */
public static LineIterator lineIterator(Reader reader) {
    final LineIterator lines = new LineIterator(reader);
    return lines;
}
From source file:ComLog.IOUtils.java
/** * Return an Iterator for the lines in an <code>InputStream</code>, using * the character encoding specified (or default encoding if null). * <p>/*from ww w. j av a 2 s. c om*/ * <code>LineIterator</code> holds a reference to the open * <code>InputStream</code> specified here. When you have finished with * the iterator you should close the stream to free internal resources. * This can be done by closing the stream directly, or by calling * {@link LineIterator#close()} or {@link LineIterator#closeQuietly(LineIterator)}. * <p> * The recommended usage pattern is: * <pre> * try { * LineIterator it = IOUtils.lineIterator(stream, "UTF-8"); * while (it.hasNext()) { * String line = it.nextLine(); * /// do something with line * } * } finally { * IOUtils.closeQuietly(stream); * } * </pre> * * @param input the <code>InputStream</code> to read from, not null * @param encoding the encoding to use, null means platform default * @return an Iterator of the lines in the reader, never null * @throws IllegalArgumentException if the input is null * @throws IOException if an I/O error occurs, such as if the encoding is invalid * @since Commons IO 1.2 */ public static LineIterator lineIterator(InputStream input, String encoding) throws IOException { Reader reader = null; if (encoding == null) { reader = new InputStreamReader(input); } else { reader = new InputStreamReader(input, encoding); } return new LineIterator(reader); }
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
/**
 * Builds a Lucene keyword index of the vocabulary, one document per word: each
 * line of the (optionally gzipped, UTF-8) vocabulary file is trimmed and stored
 * in the "word" field of the "vocab" sub-index under {@code _index_dir}.
 *
 * Fixes over the previous version:
 * - overwriting now removes the old index recursively; File.delete() fails
 *   silently on a non-empty directory, so {@code _overwrite} never actually
 *   cleared an existing index (FileUtils is already used by this class)
 * - the LineIterator (and thus the underlying input stream) is closed in a
 *   finally block instead of being leaked
 *
 * @param vocabulary_file newline-separated vocabulary, plain text or .gz
 * @throws IOException if the file or the index cannot be read/written
 */
public void create_vocabulary_index(File vocabulary_file) throws IOException {
    File index_dir = new File(_index_dir, "vocab");
    if (index_dir.exists()) {
        LOG.info("Vocabulary index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            FileUtils.deleteDirectory(index_dir); // recursive: File.delete() cannot remove non-empty dirs
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(1024.0);
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_vocab = new IndexWriter(directory, iwc);
    InputStream in = new FileInputStream(vocabulary_file);
    if (vocabulary_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));
    try {
        // one reusable document: only the "word" field value changes per line
        Document doc = new Document();
        Field f_word = new StringField("word", "", Field.Store.YES);
        doc.add(f_word);
        long c = 0;
        while (iter.hasNext()) {
            if (++c % 10000 == 0)
                LOG.info("Adding {}'th word.", c);
            String line = iter.next();
            try {
                String word = line.trim();
                f_word.setStringValue(word);
                writer_vocab.addDocument(doc);
            } catch (Exception e) {
                LOG.warn("Could not process line '{}' in file '{}', malformed line.", line, vocabulary_file, e);
            }
        }
    } finally {
        LineIterator.closeQuietly(iter); // closes the underlying reader/stream (was leaked)
    }
    writer_vocab.forceMergeDeletes();
    writer_vocab.commit();
    writer_vocab.close();
}