Example usage for org.apache.commons.io IOUtils lineIterator

List of usage examples for org.apache.commons.io IOUtils lineIterator

Introduction

On this page you can find example usages of org.apache.commons.io.IOUtils.lineIterator.

Prototype

public static LineIterator lineIterator(InputStream input, String encoding) throws IOException 

Source Link

Document

Returns an Iterator for the lines in an InputStream, using the specified character encoding (or the platform default encoding if the encoding is null).

Usage

From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.DataFetcher.java

/**
 * Reads the URL list from the {@code roomfordebate-urls.txt} classpath resource, passes the
 * URLs to {@code crawlPages}, and then extracts an article and its comments from every file
 * found in the crawled-pages folder, saving them as numbered text files.
 *
 * @param args {@code args[0]} is the folder for crawled pages, {@code args[1]} is the output
 *             folder; both are created if they do not exist
 * @throws Exception if the URL resource is missing or reading/crawling fails
 */
public static void main(String[] args) throws Exception {
    File crawledPagesFolder = new File(args[0]);
    if (!crawledPagesFolder.exists()) {
        crawledPagesFolder.mkdirs();
    }

    File outputFolder = new File(args[1]);
    if (!outputFolder.exists()) {
        outputFolder.mkdirs();
    }

    // read links from text file
    final String urlsResourceName = "roomfordebate-urls.txt";

    // read list of urls; try-with-resources guarantees the classpath stream is closed
    // even if reading fails (a null resource is tolerated by try-with-resources)
    List<String> urls = new ArrayList<>();
    try (InputStream urlsStream = DataFetcher.class.getClassLoader().getResourceAsStream(urlsResourceName)) {
        if (urlsStream == null) {
            throw new IOException("Cannot find resource " + urlsResourceName + " on the classpath");
        }

        LineIterator iterator = IOUtils.lineIterator(urlsStream, "utf-8");
        while (iterator.hasNext()) {
            // trim first so that indented comment lines ("  # ...") are ignored as well
            String line = iterator.nextLine().trim();
            // ignore blank lines and commented urls (line starts with #)
            if (!line.isEmpty() && !line.startsWith("#")) {
                urls.add(line);
            }
        }
    }

    // download all
    crawlPages(urls, crawledPagesFolder);

    // process the crawled files in a stable, name-based order so the generated ids are reproducible
    List<File> files = new ArrayList<>(FileUtils.listFiles(crawledPagesFolder, null, false));
    Collections.sort(files, new Comparator<File>() {
        @Override
        public int compare(File o1, File o2) {
            return o1.getName().compareTo(o2.getName());
        }
    });

    int idCounter = 0;

    for (File file : files) {
        NYTimesCommentsScraper commentsScraper = new NYTimesCommentsScraper();
        NYTimesArticleExtractor extractor = new NYTimesArticleExtractor();

        String html = FileUtils.readFileToString(file, "utf-8");

        // the id is consumed even when extraction fails below, so ids follow the file order
        idCounter++;
        File outputFileArticle = new File(outputFolder, String.format("Cx%03d.txt", idCounter));
        File outputFileComments = new File(outputFolder, String.format("Dx%03d.txt", idCounter));

        try {
            List<Comment> comments = commentsScraper.extractComments(html);
            Article article = extractor.extractArticle(html);

            saveArticleToText(article, outputFileArticle);
            System.out.println("Saved to " + outputFileArticle);

            saveCommentsToText(comments, outputFileComments, article);
            System.out.println("Saved to " + outputFileComments);
        } catch (IOException ex) {
            // best effort: report the failing file and continue with the next one
            System.err.println(file.getName() + "\n" + ex.getMessage());
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.StatisticsTableCreator.java

/**
 * Parses tab-separated lines from the given stream (read as UTF-8) into a table.
 *
 * <p>Each line is echoed to standard output and split on tabs; column 0 is used as the row
 * key, column 1 is appended to the {@code "docs "} and {@code "tokens "} column keys, and
 * columns 2 and 3 are parsed as {@code Long} values for those two cells.
 *
 * @param stream source of the tab-separated lines; not closed by this method
 * @return a tree-based table filled from the stream
 * @throws IOException if the line iterator cannot be created
 */
public static Table<String, String, Long> loadTable(InputStream stream) throws IOException {
    final Table<String, String, Long> table = TreeBasedTable.create();

    for (LineIterator rows = IOUtils.lineIterator(stream, "utf-8"); rows.hasNext();) {
        final String row = rows.next();

        System.out.println(row);

        // parse all four columns before touching the table
        final String[] columns = row.split("\t");
        final String language = columns[0];
        final String license = columns[1];
        final Long documentCount = Long.valueOf(columns[2]);
        final Long tokenCount = Long.valueOf(columns[3]);

        table.put(language, "docs " + license, documentCount);
        table.put(language, "tokens " + license, tokenCount);
    }

    return table;
}

From source file:de.rnd7.kata.reversi.logic.ai.AIMatrix.java

/**
 * Builds an {@code AIMatrix} from a resource, feeding every line (read as UTF-8) together
 * with its zero-based line number to {@code processLine}.
 *
 * @param name resource name, resolved via {@code AIMatrix.class.getResourceAsStream}
 * @return the populated matrix
 * @throws IOException if the resource cannot be found or read
 */
public static AIMatrix fromResource(final String name) throws IOException {
    final AIMatrix matrix = new AIMatrix();

    try (InputStream input = AIMatrix.class.getResourceAsStream(name)) {
        // getResourceAsStream returns null for a missing resource; fail with a clear
        // message instead of a NullPointerException from inside lineIterator
        if (input == null) {
            throw new IOException("Cannot find resource " + name + " on the classpath");
        }

        final LineIterator iterator = IOUtils.lineIterator(input, CharEncoding.UTF_8);

        int lineNumber = 0;
        while (iterator.hasNext()) {
            processLine(matrix, lineNumber++, iterator.next());
        }
    }

    return matrix;
}

From source file:com.icantrap.collections.dawg.TrieValidationTest.java

/**
 * Test fixture: adds every line of the {@code /TWL06.txt} classpath resource to a fresh
 * {@code DawgBuilder}, builds it, and prints node counts and the build time.
 *
 * <p>Guarded by an assumption that the {@code RUN_VALIDATION} system property is {@code "on"}.
 *
 * @throws IOException if the word list cannot be opened
 */
@Before
public void before() throws IOException {
    assumeThat(System.getProperty("RUN_VALIDATION"), is("on"));
    LineIterator iter = IOUtils.lineIterator(getClass().getResourceAsStream("/TWL06.txt"), null);
    dawgBuilder = new DawgBuilder();

    try {
        while (iter.hasNext()) {
            dawgBuilder.add(iter.next());
        }
    } finally {
        // release the underlying stream even if adding a word fails
        LineIterator.closeQuietly(iter);
    }

    System.out.println("Uncompressed:  " + dawgBuilder.nodeCount() + " nodes");

    StopWatch stopWatch = new StopWatch();
    stopWatch.start();
    dawgBuilder.build();
    stopWatch.stop();

    System.out.println("Time to compress:  " + stopWatch.getTime() + " ms.");
    System.out.println("Compressed:  " + dawgBuilder.nodeCount() + " nodes");
}

From source file:com.icantrap.collections.dawg.DawgValidationTest.java

/**
 * Asserts that {@code dawg} contains every line of the {@code /TWL06.txt} classpath resource
 * and prints how long the lookups took.
 *
 * @throws IOException if the word list cannot be opened
 */
@Test
public void containsAllWords() throws IOException {
    LineIterator iter = IOUtils.lineIterator(getClass().getResourceAsStream("/TWL06.txt"), null);

    try {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        while (iter.hasNext()) {
            String word = iter.next();
            assertTrue("Missing word (" + word + ")", dawg.contains(word));
        }

        stopWatch.stop();
        System.out.println("Time to query:  " + stopWatch.getTime() + " ms.");
    } finally {
        // a failed assertion throws, so close here to avoid leaking the resource stream
        LineIterator.closeQuietly(iter);
    }
}

From source file:com.icantrap.collections.dawg.TrieValidationTest.java

/**
 * Asserts that {@code dawgBuilder} contains every line of the {@code /TWL06.txt} classpath
 * resource and prints how long the lookups took.
 *
 * @throws IOException if the word list cannot be opened
 */
@Test
public void containsAllWords() throws IOException {
    LineIterator iter = IOUtils.lineIterator(getClass().getResourceAsStream("/TWL06.txt"), null);

    try {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        while (iter.hasNext()) {
            String word = iter.next();
            assertTrue("Missing word (" + word + ")", dawgBuilder.contains(word));
        }

        stopWatch.stop();
        System.out.println("Time to query:  " + stopWatch.getTime() + " ms.");
    } finally {
        // a failed assertion throws, so close here to avoid leaking the resource stream
        LineIterator.closeQuietly(iter);
    }
}

From source file:net.pms.io.OutputTextLogger.java

/**
 * Drains {@code inputStream} line by line (decoded as UTF-8), writing each line to the debug
 * log. Read failures are logged at debug level rather than propagated, and the iterator is
 * always closed afterwards.
 */
public void run() {
    LineIterator lines = null;

    try {
        lines = IOUtils.lineIterator(inputStream, "UTF-8");

        while (lines.hasNext()) {
            logger.debug(lines.nextLine());
        }
    } catch (IOException e) {
        logger.debug("Error consuming input stream: {}", e.getMessage());
    } catch (IllegalStateException e) {
        logger.debug("Error reading from closed input stream: {}", e.getMessage());
    } finally {
        // null-safe close; releases the reader and the wrapped stream
        LineIterator.closeQuietly(lines);
    }
}

From source file:net.orpiske.sfs.filter.dictionary.spell.DefaultDictionary.java

/**
 * Fills {@code hashSet} with the entries of the {@code /dictionaries/pt/port-big.dic}
 * classpath resource, read with the platform default charset.
 *
 * <p>Lines starting with {@code #}, lines that {@code DictionaryEntry.fromString} maps to
 * {@code null}, and entries of category {@code OTHER} are skipped.
 *
 * @throws DictionaryReadException if an I/O error occurs while opening the dictionary
 */
public DefaultDictionary() {
    final InputStream stream = getClass().getResourceAsStream("/dictionaries/pt/port-big.dic");

    try {
        final Iterator<String> lines = IOUtils.lineIterator(stream, Charset.defaultCharset());

        while (lines.hasNext()) {
            final String line = lines.next();
            if (line.startsWith("#")) {
                continue;
            }

            final DictionaryEntry entry = DictionaryEntry.fromString(line);
            final boolean usable = entry != null
                    && entry.getCategory() != DictionaryEntry.Category.OTHER;

            if (usable) {
                if (logger.isTraceEnabled()) {
                    logger.trace("Adding entry " + entry.getWord() + " to the cache");
                }

                hashSet.add(entry);
            }
        }
    } catch (IOException e) {
        // Not expected for a bundled resource; log it and surface it as a dictionary failure.
        logger.error("Unhandled I/O exception: " + e.getMessage(), e);

        throw new DictionaryReadException(e);
    } finally {
        IOUtils.closeQuietly(stream);
    }
}

From source file:modelinspector.collectors.WordlistMatchCollector.java

public WordlistMatchCollector(String aName, String aLanguage, boolean aCaseSensitive, int aCutoff, String aFile,
        String aEncoding) {//  w w  w.  j a v  a2s  .  c o m
    name = aName;
    baseVocabulary = new HashSet<>();
    caseSensitive = aCaseSensitive;
    language = new Locale(aLanguage);
    cutoff = aCutoff;

    try (InputStream is = new FileInputStream(aFile)) {
        LineIterator i = IOUtils.lineIterator(is, aEncoding);
        while (i.hasNext()) {
            String[] fields = i.nextLine().split("\t");
            if (fields.length > 1 && aCutoff > 0) {
                if (Integer.valueOf(fields[1]) < aCutoff) {
                    continue;
                }
            }
            String word = aCaseSensitive ? fields[0] : fields[0].toLowerCase(language);
            baseVocabulary.add(word);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    originalBaseVocabularySize = baseVocabulary.size();
}

From source file:gobblin.source.extractor.filebased.SingleFileDownloader.java

/**
 * Opens the given file through the extractor's file-system helper and returns an iterator
 * over its lines.
 *
 * <p>The stream is registered with the extractor's closer, so it is not closed here. When
 * the extractor is configured to skip the first record and the file is not empty, the first
 * line is consumed before the iterator is returned.
 *
 * @param file the file to download
 * @return an iterator over the (remaining) lines of the file
 * @throws IOException if the helper fails to open the file or the iterator cannot be created
 */
@SuppressWarnings("unchecked")
public Iterator<D> downloadFile(String file) throws IOException {

    log.info("Beginning to download file: " + file);

    try {
        InputStream fileStream = this.fileBasedExtractor.getCloser()
                .register(this.fileBasedExtractor.getFsHelper().getFileStream(file));

        // unchecked: the line iterator yields Strings, which callers treat as D
        Iterator<D> lines = (Iterator<D>) IOUtils.lineIterator(fileStream,
                ConfigurationKeys.DEFAULT_CHARSET_ENCODING);

        if (this.fileBasedExtractor.isShouldSkipFirstRecord()) {
            if (lines.hasNext()) {
                lines.next();
            }
        }
        return lines;
    } catch (FileBasedHelperException e) {
        throw new IOException("Exception while downloading file " + file + " with message " + e.getMessage(),
                e);
    }
}