List of usage examples for the org.apache.commons.io.LineIterator constructor LineIterator(Reader)
public LineIterator(final Reader reader) throws IllegalArgumentException
Reader
. From source file:de.tudarmstadt.lt.lm.service.BreakIteratorStringProvider.java
@Override public List<String> splitSentences(String text, String language_code) throws Exception { LOG.trace(String.format("Splitting sentences from text: %s", StringUtils.abbreviate(text, 200))); List<String> sentences = new ArrayList<String>(); text = de.tudarmstadt.lt.utilities.StringUtils.trim_and_replace_emptyspace(text, " "); for (LineIterator iter = new LineIterator(new StringReader(text)); iter.hasNext();) { String line = iter.nextLine(); BreakIterator sentence_bounds = BreakIterator.getSentenceInstance(LocaleUtils.toLocale(language_code)); sentence_bounds.setText(line);// www . j a v a2s . c om int begin_s = sentence_bounds.first(); for (int end_s = sentence_bounds .next(); end_s != BreakIterator.DONE; begin_s = end_s, end_s = sentence_bounds.next()) { String sentence = de.tudarmstadt.lt.utilities.StringUtils.trim(line.substring(begin_s, end_s)); if (sentence.isEmpty()) continue; sentences.add(sentence); LOG.trace(String.format("Current sentence: %s", StringUtils.abbreviate(sentence, 200))); } } LOG.trace(String.format("Split text '%s' into '%d' sentences.", StringUtils.abbreviate(text, 200), sentences.size())); return sentences; }
From source file:mitm.common.dlp.impl.FileWordSkipper.java
/**
 * Loads the word list from {@code file} into {@code words}.
 *
 * <p>Each line is trimmed; blank lines and lines starting with {@code #} are
 * skipped. Every remaining line is added in lower case.
 *
 * @throws IOException if the file does not exist or cannot be read
 */
private void loadFile() throws IOException {
    if (!file.exists()) {
        throw new IOException("File " + file.getAbsolutePath() + " does not exist.");
    }

    FileInputStream input = new FileInputStream(file);
    try {
        /*
         * Read the file using a Unicode aware reader
         */
        UnicodeReader unicodeReader = new UnicodeReader(input, CharEncoding.UTF_8);
        LineIterator lineIterator = new LineIterator(unicodeReader);

        while (lineIterator.hasNext()) {
            String line = StringUtils.trimToNull(lineIterator.nextLine());

            if (line == null || line.startsWith("#")) {
                continue;
            }

            words.add(line.toLowerCase());
        }
    } finally {
        // Always release the file handle, even when reading fails half-way.
        input.close();
    }
}
From source file:de.tudarmstadt.lt.lm.service.PreTokenizedStringProvider.java
@Override public List<String> splitSentences(String text, String language_code) throws Exception { LOG.trace(String.format("Splitting sentences from text: %s", StringUtils.abbreviate(text, 200))); List<String> sentences = new ArrayList<String>(); text = de.tudarmstadt.lt.utilities.StringUtils.trim_and_replace_emptyspace(text, " "); for (LineIterator iter = new LineIterator(new StringReader(text)); iter.hasNext();) { String line = iter.nextLine(); String sentence = de.tudarmstadt.lt.utilities.StringUtils.trim(line); if (sentence.isEmpty()) continue; sentences.add(sentence);//from w w w . j ava 2 s .co m LOG.trace(String.format("Current sentence: %s", StringUtils.abbreviate(sentence, 200))); } LOG.debug(String.format("Split text '%s' into '%d' sentences.", StringUtils.abbreviate(text, 200), sentences.size())); return sentences; }
From source file:com.mewmew.fairy.v1.json.BaseJsonSpell.java
@Override public void process(InputStream in, OutputStream out) { try {//w w w.ja v a 2 s . c o m Iterator<Map<String, Object>> iterator = null; if (inputType != null) { JsonRegistry registry = new JsonRegistry("inputs.json"); Map<String, Map<String, Object>> map = registry.loadAsMap("name"); Map<String, Object> json = map.get(inputType); Spell spell = Pipe.toSpell(((ArrayList<String>) json.get("pipe")).toArray(new String[0])); if (spell instanceof Source) { // TODO : do some magic type variable checking here. iterator = ((Source) spell).createIterator(in); } // TODO : any pipe that is a Sink<Map<String, Object>> should also work by using PipedOutputStream chaining... } if (iterator == null) { if (streaming) { // HACK : ugly workaround , replace me iterator = (Iterator<Map<String, Object>>) (Object) new JsonArrayIterator<Map>( factory.createJsonParser(in), Map.class); } else { iterator = new SimpleJsonIterator(new LineIterator(new InputStreamReader(in))); } } pipe = createPipe(); output = createOutput(out); PipeUtil.process(iterator, pipe, output); } catch (IOException e) { throw new RuntimeException(e); } catch (Exception e) { throw new RuntimeException(e); } }
From source file:eu.eexcess.europeana.clickstream.EuropeanaQueryCollector.java
private void processDirectory(File dir, PrintWriter writer, QueryParser queryParser, Set<String> uniqueQueryCollector) throws IOException, FileNotFoundException, UnsupportedEncodingException { for (File file : dir.listFiles()) { if (!file.getName().endsWith(".gz")) { continue; }//from w ww. j a v a2s .c o m System.err.println("Parsing file: " + file); GZIPInputStream inputStream = new GZIPInputStream(new FileInputStream(file)); LineIterator iterator = new LineIterator(new InputStreamReader(inputStream, "UTF-8")); while (iterator.hasNext()) { String line = iterator.nextLine(); if (line.contains(QUERY_PREFIX) && !line.contains("query=DATA_PROVIDER") && !line.contains("qf=DATA_PROVIDER") && !line.contains("bot") && !line.contains("slurp") && !line.contains("spider")) { int start = line.indexOf(QUERY_PREFIX), end = line.indexOf("\"", start), end2 = line.indexOf("&", start); if (end2 < end && end2 > 0) { end = end2; } if (end < 0) { end = line.length(); } try { String query = URLDecoder.decode(line.substring(start + QUERY_PREFIX.length(), end), "UTF-8"); if (!query.contains(":")) { Query parsedQuery = queryParser.parse(query); if (parsedQuery instanceof BooleanQuery) { List<BooleanClause> clauses = ((BooleanQuery) parsedQuery).clauses(); if (clauses != null) { List<String> queryTerms = new ArrayList<String>(); boolean onlyTermQueries = true; for (BooleanClause clause : clauses) { if (!(clause.getQuery() instanceof TermQuery)) { // there is at lease a single non term query onlyTermQueries = false; break; } else { TermQuery termQuery = (TermQuery) clause.getQuery(); if (termQuery.getTerm().field().equals(DEFAULT_FIELD)) { queryTerms.add(termQuery.getTerm().text()); } } } if (onlyTermQueries && queryTerms.size() == 2) { StringBuilder builder = new StringBuilder(); for (String e : new TreeSet<String>(queryTerms)) { if (builder.length() > 0) { builder.append('\t'); } builder.append(e); } // queryTerms.stream().map( (a, b) -> {b.append(a)}); String normalisedQuery = 
builder.toString(); if (uniqueQueryCollector.add(normalisedQuery)) { StringBuilder b = new StringBuilder(); for (String e : queryTerms) { if (b.length() > 0) { b.append('\t'); } b.append(e); } String queryInNaturalSequence = b.toString(); writer.println(queryInNaturalSequence); System.out.println(queryInNaturalSequence); } } } } } } catch (Exception e) { e.printStackTrace(); } } } iterator.close(); inputStream.close(); } }
From source file:de.tudarmstadt.lt.lm.service.LtSegProvider.java
@Override public List<String> splitSentences(String text, String language_code) throws Exception { LOG.trace(String.format("Splitting sentences from text: %s", StringUtils.abbreviate(text, 200))); List<String> sentences = new ArrayList<String>(); if (Properties.onedocperline()) { LineIterator liter = new LineIterator(new StringReader(text)); for (String line; (line = liter.hasNext() ? liter.next() : null) != null;) split_and_add_sentences(line, sentences); } else {//from ww w.ja v a2 s . c om split_and_add_sentences(text, sentences); } LOG.trace(String.format("Split text '%s' into '%d' sentences.", StringUtils.abbreviate(text, 200), sentences.size())); return sentences; }
From source file:eu.eexcess.domaindetection.wordnet.WordnetDomainsReader.java
public void read(File file) throws IOException { System.out.println("Read in the original WordNet Domains file: " + file); LineIterator iterator = new LineIterator(new FileReader(file)); while (iterator.hasNext()) { String line = iterator.nextLine(); String[] tokens = line.split("[\t\\ ]"); String synset = tokens[0]; for (int i = 1; i < tokens.length; i++) { DomainAssignment assignment = new DomainAssignment(tokens[i], 1); Set<DomainAssignment> domains = synsetToDomains.get(synset); if (domains == null) { domains = new TreeSet<DomainAssignment>(); synsetToDomains.put(synset, domains); }// w w w .ja v a 2 s .co m domains.add(assignment); } } iterator.close(); }
From source file:edu.cornell.med.icb.goby.modes.TrimMode.java
@Override public void execute() throws IOException { ReadsReader reader = null;/*from w ww . ja va2 s.c o m*/ ReadsWriter writer = null; final ProgressLogger progress = new ProgressLogger(LOG); try { reader = new ReadsReader(inputFilename); final LineIterator lines = new LineIterator(new FileReader(adapterFilename)); final ObjectArrayList<MutableString> adapterList = new ObjectArrayList<MutableString>(); while (lines.hasNext()) { final String next = lines.nextLine(); adapterList.add(new MutableString(next)); } final MutableString[] adapters; if (complementAdapters) { adapters = addComplementAdapters(adapterList); } else { adapters = adapterList.toArray(new MutableString[adapterList.size()]); } progress.start(); writer = new ReadsWriterImpl(new FileOutputStream(outputFilename)); final ByteArrayList newQualScores = new ByteArrayList(); final ByteArrayList newPairQualScores = new ByteArrayList(); final MutableString sequence = new MutableString(); final MutableString sequencePair = new MutableString(); for (final Reads.ReadEntry entry : reader) { // observe(counters, entry.getSequence(), entry.getReadIndex()); ReadsReader.decodeSequence(entry, sequence); final ByteString qualityScores = entry.getQualityScores(); newQualScores.clear(); final MutableString seq1 = trim(adapters, newQualScores, sequence, qualityScores); MutableString pairSeq = null; numSequencesInInput++; if (entry.hasSequencePair()) { newPairQualScores.clear(); ReadsReader.decodeSequence(entry, sequencePair, true); final ByteString pairQualityScores = entry.getQualityScoresPair(); pairSeq = trim(adapters, newPairQualScores, sequencePair, pairQualityScores); numSequencesInInput++; } // System.out.printf(">seq%n%s%n", c); Reads.ReadEntry.Builder builder = Reads.ReadEntry.newBuilder(); builder = builder.mergeFrom(entry).setSequence(ReadsWriterImpl.encodeSequence(seq1, buffer)) .setReadLength(seq1.length()); if (sequence.length() != seq1.length()) { numTrimmed++; final byte[] bytes1 = 
newQualScores.toByteArray(); builder = builder.setQualityScores(ByteString.copyFrom(bytes1)); assert builder.getQualityScores().size() == builder.getSequence() .size() : "sequence length and quality scores must match."; } if (entry.hasSequencePair()) { builder = builder.mergeFrom(entry) .setSequencePair(ReadsWriterImpl.encodeSequence(pairSeq, buffer)) .setReadLength(pairSeq.length()); if (sequencePair.length() != pairSeq.length()) { numTrimmed++; builder = builder .setQualityScoresPair(ByteString.copyFrom(newPairQualScores.toByteArray())); assert builder.getQualityScoresPair().size() == builder.getSequencePair() .size() : "sequence length and quality scores must match."; } } if (seq1.length() > 0 || sequencePair.length() > 0) { // some sequence must remain to append to the output: writer.appendEntry(builder); } progress.lightUpdate(); } progress.stop(); final int numSequencesTrimmed = numTrimmed; double percent = 100d * numSequencesTrimmed; percent /= numSequencesInInput; System.out.printf( "Number of reads trimmed %d (%g %% of input sequences), including: %n" + "left: %d (%g%%)%n" + "right: %d (%g%%), %n" + "fully contained: %d (%g%%)%n", numSequencesTrimmed, percent, numTrimmedLeft, percent(numTrimmedLeft, numSequencesTrimmed), numTrimmedRight, percent(numTrimmedRight, numSequencesTrimmed), numContained, percent(numContained, numSequencesTrimmed)); System.out.flush(); } finally { if (writer != null) { writer.close(); } } progress.stop(); }
From source file:fr.aliacom.obm.common.calendar.MailSendTest.java
private void assertTextCalendarContentTransferEncodingIsCorrect(CalendarEncoding encoding) throws Exception { String icsContent = IOUtils.toString(getClass().getResourceAsStream("meetingWithOneAttendee.ics")); EventMail eventMail = new EventMail(new InternetAddress("sender@test"), ImmutableList.of(newAttendee("attendee1")), SUBJECT, BODY_TEXT, BODY_HTML, icsContent, ICS_METHOD, encoding);//from w w w. ja v a2 s . c o m String content = writeEventMail(eventMail); LineIterator lineIterator = new LineIterator(new StringReader(content)); boolean textCalendarFound = false; while (lineIterator.hasNext()) { if (lineIterator.next().contains("Content-Type: text/calendar")) { textCalendarFound = true; break; } } assertThat(textCalendarFound).isTrue(); assertThat(lineIterator.next()).contains("Content-Transfer-Encoding: " + encoding.getValue()); }
From source file:com.mewmew.fairy.v1.book.Cut.java
/**
 * Creates an iterator over {@code in} that reads the stream line by line and
 * wraps the lines in a {@code MappingIterator} constructed with this instance.
 *
 * @param in stream to read lines from (decoded with the platform default charset)
 * @return iterator produced by {@code MappingIterator}
 */
public Iterator<Map<String, Object>> createIterator(InputStream in) {
    final InputStreamReader reader = new InputStreamReader(in);
    final Iterator<String> lines = (Iterator<String>) new LineIterator(reader);
    return new MappingIterator(lines, this);
}