Example usage for org.apache.commons.io LineIterator LineIterator

List of usage examples for org.apache.commons.io LineIterator LineIterator

Introduction

On this page you can find example usages of the org.apache.commons.io.LineIterator constructor LineIterator(Reader).

Prototype

public LineIterator(final Reader reader) throws IllegalArgumentException 

Source Link

Document

Constructs an iterator of the lines for a Reader.

Usage

From source file:de.tudarmstadt.lt.lm.service.BreakIteratorStringProvider.java

/**
 * Splits {@code text} into sentences, line by line.
 *
 * The text is first normalised with {@code trim_and_replace_emptyspace}; every line of the
 * result is then segmented with a sentence {@link BreakIterator} for the locale derived from
 * {@code language_code}. Segments that are empty after trimming are dropped.
 *
 * @param text the text to split
 * @param language_code locale string handed to {@code LocaleUtils.toLocale}
 * @return the non-empty, trimmed sentences in order of appearance
 */
@Override
public List<String> splitSentences(String text, String language_code) throws Exception {
    LOG.trace(String.format("Splitting sentences from text: %s", StringUtils.abbreviate(text, 200)));
    final List<String> result = new ArrayList<String>();

    text = de.tudarmstadt.lt.utilities.StringUtils.trim_and_replace_emptyspace(text, " ");

    final LineIterator lines = new LineIterator(new StringReader(text));
    while (lines.hasNext()) {
        final String currentLine = lines.nextLine();
        final BreakIterator boundaries = BreakIterator
                .getSentenceInstance(LocaleUtils.toLocale(language_code));
        boundaries.setText(currentLine);

        // Walk consecutive boundary pairs [from, to) until the iterator is exhausted.
        int from = boundaries.first();
        int to = boundaries.next();
        while (to != BreakIterator.DONE) {
            final String candidate = de.tudarmstadt.lt.utilities.StringUtils
                    .trim(currentLine.substring(from, to));
            if (!candidate.isEmpty()) {
                result.add(candidate);
                LOG.trace(String.format("Current sentence: %s", StringUtils.abbreviate(candidate, 200)));
            }
            from = to;
            to = boundaries.next();
        }
    }
    LOG.trace(String.format("Split text '%s' into '%d' sentences.", StringUtils.abbreviate(text, 200),
            result.size()));
    return result;
}

From source file:mitm.common.dlp.impl.FileWordSkipper.java

/**
 * Loads the word list from {@code file} into {@code words}.
 *
 * Each line is trimmed; blank lines and lines starting with {@code #} are skipped, all other
 * lines are added in lower case. The underlying file stream is always closed, even when
 * reading fails.
 *
 * @throws IOException if the file does not exist or cannot be read
 */
private void loadFile() throws IOException {
    if (!file.exists()) {
        throw new IOException("File " + file.getAbsolutePath() + " does not exist.");
    }

    FileInputStream input = new FileInputStream(file);

    try {
        /*
         * Read the file using a Unicode aware reader
         */
        UnicodeReader unicodeReader = new UnicodeReader(input, CharEncoding.UTF_8);

        LineIterator lineIterator = new LineIterator(unicodeReader);

        while (lineIterator.hasNext()) {
            String line = StringUtils.trimToNull(lineIterator.nextLine());

            if (line == null || line.startsWith("#")) {
                continue;
            }

            words.add(line.toLowerCase());
        }
    } finally {
        // Release the file handle regardless of how reading ended.
        input.close();
    }
}

From source file:de.tudarmstadt.lt.lm.service.PreTokenizedStringProvider.java

/**
 * Treats every non-empty line of the (normalised) text as one sentence.
 *
 * {@code language_code} is not used by this implementation.
 *
 * @param text the pre-tokenized text, one sentence per line
 * @param language_code ignored
 * @return the trimmed, non-empty lines in order of appearance
 */
@Override
public List<String> splitSentences(String text, String language_code) throws Exception {
    LOG.trace(String.format("Splitting sentences from text: %s", StringUtils.abbreviate(text, 200)));
    final List<String> result = new ArrayList<String>();

    text = de.tudarmstadt.lt.utilities.StringUtils.trim_and_replace_emptyspace(text, " ");

    final LineIterator lines = new LineIterator(new StringReader(text));
    while (lines.hasNext()) {
        final String candidate = de.tudarmstadt.lt.utilities.StringUtils.trim(lines.nextLine());
        if (!candidate.isEmpty()) {
            result.add(candidate);
            LOG.trace(String.format("Current sentence: %s", StringUtils.abbreviate(candidate, 200)));
        }
    }
    LOG.debug(String.format("Split text '%s' into '%d' sentences.", StringUtils.abbreviate(text, 200),
            result.size()));
    return result;
}

From source file:com.mewmew.fairy.v1.json.BaseJsonSpell.java

/**
 * Reads records from {@code in}, runs them through the pipe and writes the result to
 * {@code out}.
 *
 * The record iterator is chosen as follows: if {@code inputType} is set, the matching entry
 * of {@code inputs.json} is turned into a spell and, when that spell is a {@code Source}, it
 * supplies the iterator. Otherwise (or if no iterator was obtained) the input is parsed either
 * as a streamed JSON array or line by line, depending on {@code streaming}.
 *
 * Any exception raised along the way is rethrown wrapped in a {@link RuntimeException}.
 */
@Override
public void process(InputStream in, OutputStream out) {
    try {
        Iterator<Map<String, Object>> records = null;

        if (inputType != null) {
            final JsonRegistry inputRegistry = new JsonRegistry("inputs.json");
            final Map<String, Map<String, Object>> inputsByName = inputRegistry.loadAsMap("name");
            final Map<String, Object> definition = inputsByName.get(inputType);
            final String[] pipeSpec = ((ArrayList<String>) definition.get("pipe")).toArray(new String[0]);
            final Spell spell = Pipe.toSpell(pipeSpec);
            if (spell instanceof Source) {
                // TODO : do some magic type variable checking here.
                records = ((Source) spell).createIterator(in);
            }
            // TODO : any pipe that is a Sink<Map<String, Object>> should also work by using PipedOutputStream chaining...
        }

        if (records == null && streaming) {
            // HACK : ugly workaround , replace me
            records = (Iterator<Map<String, Object>>) (Object) new JsonArrayIterator<Map>(
                    factory.createJsonParser(in), Map.class);
        } else if (records == null) {
            records = new SimpleJsonIterator(new LineIterator(new InputStreamReader(in)));
        }

        pipe = createPipe();
        output = createOutput(out);
        PipeUtil.process(records, pipe, output);
    } catch (Exception e) {
        // Checked and unchecked failures alike surface as RuntimeException with the cause kept.
        throw new RuntimeException(e);
    }
}

From source file:eu.eexcess.europeana.clickstream.EuropeanaQueryCollector.java

/**
 * Scans every {@code .gz} file directly inside {@code dir} and collects two-term queries.
 *
 * Each file is read line by line as gzip-compressed UTF-8 text. Streams are closed even when
 * reading fails part-way through.
 *
 * @param dir directory containing the compressed log files
 * @param writer receives each newly seen query, terms separated by a tab
 * @param queryParser parser used to analyse the decoded query string
 * @param uniqueQueryCollector set of normalised (sorted) queries used for de-duplication
 * @throws IOException if the directory cannot be listed or a file cannot be read
 */
private void processDirectory(File dir, PrintWriter writer, QueryParser queryParser,
        Set<String> uniqueQueryCollector)
        throws IOException, FileNotFoundException, UnsupportedEncodingException {
    File[] files = dir.listFiles();
    if (files == null) {
        // listFiles() yields null when dir is not a directory or an I/O error occurs.
        throw new IOException("Cannot list files of directory: " + dir);
    }
    for (File file : files) {
        if (!file.getName().endsWith(".gz")) {
            continue;
        }
        System.err.println("Parsing file: " + file);

        FileInputStream fileStream = new FileInputStream(file);
        try {
            GZIPInputStream inputStream = new GZIPInputStream(fileStream);
            try {
                LineIterator iterator = new LineIterator(new InputStreamReader(inputStream, "UTF-8"));
                while (iterator.hasNext()) {
                    collectQueryFromLine(iterator.nextLine(), writer, queryParser, uniqueQueryCollector);
                }
            } finally {
                inputStream.close();
            }
        } finally {
            // Also covers the case where the GZIP header could not be read.
            fileStream.close();
        }
    }
}

/**
 * Extracts the query from a single log line and, if it consists of exactly two default-field
 * terms and has not been seen before, writes it to {@code writer} and standard output.
 * Failures while decoding or parsing a line are printed and do not abort processing.
 */
private void collectQueryFromLine(String line, PrintWriter writer, QueryParser queryParser,
        Set<String> uniqueQueryCollector) {
    if (!line.contains(QUERY_PREFIX) || line.contains("query=DATA_PROVIDER")
            || line.contains("qf=DATA_PROVIDER") || line.contains("bot") || line.contains("slurp")
            || line.contains("spider")) {
        return;
    }

    int start = line.indexOf(QUERY_PREFIX);
    int end = line.indexOf("\"", start);
    int end2 = line.indexOf("&", start);
    // The query stops at whichever terminator comes first; a missing quote must not hide an '&'.
    if (end2 >= 0 && (end < 0 || end2 < end)) {
        end = end2;
    }
    if (end < 0) {
        end = line.length();
    }

    try {
        String query = URLDecoder.decode(line.substring(start + QUERY_PREFIX.length(), end), "UTF-8");
        if (query.contains(":")) {
            return;
        }
        Query parsedQuery = queryParser.parse(query);
        if (!(parsedQuery instanceof BooleanQuery)) {
            return;
        }
        List<BooleanClause> clauses = ((BooleanQuery) parsedQuery).clauses();
        if (clauses == null) {
            return;
        }

        List<String> queryTerms = new ArrayList<String>();
        for (BooleanClause clause : clauses) {
            if (!(clause.getQuery() instanceof TermQuery)) {
                // there is at least a single non term query
                return;
            }
            TermQuery termQuery = (TermQuery) clause.getQuery();
            if (termQuery.getTerm().field().equals(DEFAULT_FIELD)) {
                queryTerms.add(termQuery.getTerm().text());
            }
        }
        if (queryTerms.size() != 2) {
            return;
        }

        // Sorted form identifies the query independent of term order.
        String normalisedQuery = joinWithTabs(new TreeSet<String>(queryTerms));
        if (uniqueQueryCollector.add(normalisedQuery)) {
            String queryInNaturalSequence = joinWithTabs(queryTerms);
            writer.println(queryInNaturalSequence);
            System.out.println(queryInNaturalSequence);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/** Concatenates the terms, inserting a tab whenever the text built so far is non-empty. */
private static String joinWithTabs(Iterable<String> terms) {
    StringBuilder builder = new StringBuilder();
    for (String term : terms) {
        if (builder.length() > 0) {
            builder.append('\t');
        }
        builder.append(term);
    }
    return builder.toString();
}

From source file:de.tudarmstadt.lt.lm.service.LtSegProvider.java

/**
 * Splits {@code text} into sentences via {@code split_and_add_sentences}.
 *
 * When {@code Properties.onedocperline()} is true, every line of the text is split on its own;
 * otherwise the whole text is split in one go.
 *
 * @param text the text to split
 * @param language_code not used by this method
 * @return the collected sentences
 */
@Override
public List<String> splitSentences(String text, String language_code) throws Exception {
    LOG.trace(String.format("Splitting sentences from text: %s", StringUtils.abbreviate(text, 200)));
    final List<String> result = new ArrayList<String>();

    if (!Properties.onedocperline()) {
        split_and_add_sentences(text, result);
    } else {
        final LineIterator lines = new LineIterator(new StringReader(text));
        while (lines.hasNext()) {
            final String document = lines.next();
            if (document == null) {
                // Defensive: stop if the iterator ever yields null.
                break;
            }
            split_and_add_sentences(document, result);
        }
    }

    LOG.trace(String.format("Split text '%s' into '%d' sentences.", StringUtils.abbreviate(text, 200),
            result.size()));
    return result;
}

From source file:eu.eexcess.domaindetection.wordnet.WordnetDomainsReader.java

/**
 * Reads a WordNet Domains file and fills {@code synsetToDomains}.
 *
 * Each line is split on tab or space characters: the first token is the synset, every further
 * token becomes a {@code DomainAssignment} with weight 1 in that synset's set. The file is
 * closed even if reading or parsing fails.
 *
 * @param file the WordNet Domains file to read
 * @throws IOException if the file cannot be opened or read
 */
public void read(File file) throws IOException {
    System.out.println("Read in the original WordNet Domains file: " + file);
    LineIterator iterator = new LineIterator(new FileReader(file));
    try {
        while (iterator.hasNext()) {
            String line = iterator.nextLine();
            String[] tokens = line.split("[\t\\ ]");
            String synset = tokens[0];
            for (int i = 1; i < tokens.length; i++) {
                DomainAssignment assignment = new DomainAssignment(tokens[i], 1);
                Set<DomainAssignment> domains = synsetToDomains.get(synset);
                if (domains == null) {
                    // First domain seen for this synset: create its set lazily.
                    domains = new TreeSet<DomainAssignment>();
                    synsetToDomains.put(synset, domains);
                }
                domains.add(assignment);
            }
        }
    } finally {
        iterator.close();
    }
}

From source file:edu.cornell.med.icb.goby.modes.TrimMode.java

@Override
public void execute() throws IOException {

    ReadsReader reader = null;/*from  w ww  . ja  va2  s.c  o m*/
    ReadsWriter writer = null;
    final ProgressLogger progress = new ProgressLogger(LOG);
    try {

        reader = new ReadsReader(inputFilename);
        final LineIterator lines = new LineIterator(new FileReader(adapterFilename));
        final ObjectArrayList<MutableString> adapterList = new ObjectArrayList<MutableString>();
        while (lines.hasNext()) {
            final String next = lines.nextLine();
            adapterList.add(new MutableString(next));
        }
        final MutableString[] adapters;
        if (complementAdapters) {
            adapters = addComplementAdapters(adapterList);
        } else {
            adapters = adapterList.toArray(new MutableString[adapterList.size()]);
        }
        progress.start();
        writer = new ReadsWriterImpl(new FileOutputStream(outputFilename));

        final ByteArrayList newQualScores = new ByteArrayList();
        final ByteArrayList newPairQualScores = new ByteArrayList();
        final MutableString sequence = new MutableString();
        final MutableString sequencePair = new MutableString();

        for (final Reads.ReadEntry entry : reader) {
            //      observe(counters, entry.getSequence(), entry.getReadIndex());
            ReadsReader.decodeSequence(entry, sequence);

            final ByteString qualityScores = entry.getQualityScores();
            newQualScores.clear();

            final MutableString seq1 = trim(adapters, newQualScores, sequence, qualityScores);
            MutableString pairSeq = null;

            numSequencesInInput++;
            if (entry.hasSequencePair()) {
                newPairQualScores.clear();

                ReadsReader.decodeSequence(entry, sequencePair, true);

                final ByteString pairQualityScores = entry.getQualityScoresPair();
                pairSeq = trim(adapters, newPairQualScores, sequencePair, pairQualityScores);
                numSequencesInInput++;
            }

            //    System.out.printf(">seq%n%s%n", c);
            Reads.ReadEntry.Builder builder = Reads.ReadEntry.newBuilder();
            builder = builder.mergeFrom(entry).setSequence(ReadsWriterImpl.encodeSequence(seq1, buffer))
                    .setReadLength(seq1.length());
            if (sequence.length() != seq1.length()) {
                numTrimmed++;
                final byte[] bytes1 = newQualScores.toByteArray();
                builder = builder.setQualityScores(ByteString.copyFrom(bytes1));
                assert builder.getQualityScores().size() == builder.getSequence()
                        .size() : "sequence length and quality scores must match.";
            }

            if (entry.hasSequencePair()) {
                builder = builder.mergeFrom(entry)
                        .setSequencePair(ReadsWriterImpl.encodeSequence(pairSeq, buffer))
                        .setReadLength(pairSeq.length());

                if (sequencePair.length() != pairSeq.length()) {
                    numTrimmed++;
                    builder = builder
                            .setQualityScoresPair(ByteString.copyFrom(newPairQualScores.toByteArray()));
                    assert builder.getQualityScoresPair().size() == builder.getSequencePair()
                            .size() : "sequence length and quality scores must match.";

                }
            }
            if (seq1.length() > 0 || sequencePair.length() > 0) {
                // some sequence must remain to append to the output:
                writer.appendEntry(builder);
            }
            progress.lightUpdate();
        }
        progress.stop();

        final int numSequencesTrimmed = numTrimmed;
        double percent = 100d * numSequencesTrimmed;
        percent /= numSequencesInInput;
        System.out.printf(
                "Number of reads trimmed %d (%g %% of input sequences), including: %n" + "left: %d (%g%%)%n"
                        + "right: %d (%g%%), %n" + "fully contained: %d (%g%%)%n",
                numSequencesTrimmed, percent, numTrimmedLeft, percent(numTrimmedLeft, numSequencesTrimmed),
                numTrimmedRight, percent(numTrimmedRight, numSequencesTrimmed), numContained,
                percent(numContained, numSequencesTrimmed));
        System.out.flush();
    } finally {
        if (writer != null) {
            writer.close();
        }

    }

    progress.stop();
}

From source file:fr.aliacom.obm.common.calendar.MailSendTest.java

/**
 * Builds an event mail with the given calendar encoding, writes it out and checks that the
 * line following the {@code Content-Type: text/calendar} header carries the matching
 * {@code Content-Transfer-Encoding} value.
 *
 * @param encoding the calendar encoding under test
 */
private void assertTextCalendarContentTransferEncodingIsCorrect(CalendarEncoding encoding) throws Exception {
    final String icsContent = IOUtils.toString(getClass().getResourceAsStream("meetingWithOneAttendee.ics"));
    final EventMail eventMail = new EventMail(new InternetAddress("sender@test"),
            ImmutableList.of(newAttendee("attendee1")), SUBJECT, BODY_TEXT, BODY_HTML, icsContent, ICS_METHOD,
            encoding);
    final LineIterator mailLines = new LineIterator(new StringReader(writeEventMail(eventMail)));

    // Advance until the calendar part header has just been consumed (or the mail is exhausted).
    boolean textCalendarFound = false;
    while (!textCalendarFound && mailLines.hasNext()) {
        textCalendarFound = mailLines.next().contains("Content-Type: text/calendar");
    }

    assertThat(textCalendarFound).isTrue();
    assertThat(mailLines.next()).contains("Content-Transfer-Encoding: " + encoding.getValue());
}

From source file:com.mewmew.fairy.v1.book.Cut.java

/**
 * Wraps the lines read from {@code in} in a {@code MappingIterator} that uses this object
 * as its second constructor argument.
 *
 * @param in the stream to read lines from
 * @return an iterator over the mapped lines
 */
public Iterator<Map<String, Object>> createIterator(InputStream in) {
    final LineIterator lines = new LineIterator(new InputStreamReader(in));
    return new MappingIterator((Iterator<String>) lines, this);
}