Example usage for org.apache.commons.io LineIterator nextLine

List of usage examples for org.apache.commons.io LineIterator nextLine

Introduction

On this page you can find example usages of org.apache.commons.io LineIterator nextLine.

Prototype

public String nextLine() 

Source Link

Document

Returns the next line in the wrapped Reader.

Usage

From source file:de.tu.darmstadt.lt.ner.preprocessing.SentenceToCRFWriter.java

/**
 * Collects sentences from a tab-separated file and hands them to
 * {@code GermaNERMain.sentenceToCRFFormat}.
 *
 * <p>{@code args[0]} is the input file, read as UTF-8; the second tab-separated column of every
 * line is taken as the sentence. {@code args[1]} is forwarded unchanged to
 * {@code sentenceToCRFFormat} together with the language code {@code "de"}.
 *
 * @param args {@code args[0]} input file path, {@code args[1]} second argument of
 *             {@code sentenceToCRFFormat}
 * @throws IOException if the input file cannot be opened
 */
public static void main(String[] args) throws UIMAException, IllegalArgumentException, IOException {
    List<String> sentences = new ArrayList<String>();
    LineIterator sentIt = FileUtils.lineIterator(new File(args[0]), "UTF-8");
    try {
        while (sentIt.hasNext()) {
            String[] columns = sentIt.nextLine().trim().split("\t");
            // Blank lines and lines without a second column carry no sentence; skip them
            // instead of failing with ArrayIndexOutOfBoundsException.
            if (columns.length < 2 || columns[1].equals("")) {
                continue;
            }
            sentences.add(columns[1]);
        }
    } finally {
        // Release the underlying file handle even when reading fails half-way.
        LineIterator.closeQuietly(sentIt);
    }
    GermaNERMain.sentenceToCRFFormat(sentences, args[1], "de");
}

From source file:eu.annocultor.converters.geonames.GeonamesDumpToRdf.java

/**
 * Splits the combined dump {@code input_source/all-geonames-rdf.txt} into one RDF file per
 * country, written to {@code input_source/<continent>/<country>.rdf}.
 *
 * <p>The country-to-continent mapping is loaded from the classpath resource
 * {@code /country-to-continent.properties}. Every line of the dump that does not start with
 * {@code http://sws.geonames} is treated as one RDF document: its country code is taken from the
 * {@code inCountry} element, the closing {@code </rdf:RDF>} is stripped, and (for every document
 * after the first of a country) the opening {@code <rdf:RDF ...>} tag is stripped as well, so the
 * documents of one country are concatenated into a single RDF file.
 *
 * @param args not used
 * @throws Exception if a document references a country that is not in the mapping, or on any
 *                   I/O failure
 */
public static void main(String[] args) throws Exception {
    File root = new File("input_source");

    // load country-continent match
    countryToContinent
            .load((new GeonamesDumpToRdf()).getClass().getResourceAsStream("/country-to-continent.properties"));

    // one output writer and one "has received content" flag per country code
    Map<String, BufferedWriter> files = new HashMap<String, BufferedWriter>();
    Map<String, Boolean> started = new HashMap<String, Boolean>();

    try {
        for (Object string : countryToContinent.keySet()) {
            String continent = countryToContinent.getProperty(string.toString());
            File dir = new File(root, continent);
            if (!dir.exists()) {
                dir.mkdir();
            }
            files.put(string.toString(), new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(new File(root, continent + "/" + string + ".rdf")), "UTF-8")));
            System.out.println(continent + "/" + string + ".rdf");
            started.put(string.toString(), false);
        }

        System.out.println(started);

        Pattern countryPattern = Pattern
                .compile("<inCountry rdf\\:resource\\=\"http\\://www\\.geonames\\.org/countries/\\#(\\w\\w)\"/>");
        long counter = 0;
        LineIterator it = FileUtils.lineIterator(new File(root, "all-geonames-rdf.txt"), "UTF-8");
        try {
            while (it.hasNext()) {
                String text = it.nextLine();
                if (text.startsWith("http://sws.geonames")) {
                    continue;
                }

                // progress
                counter++;
                if (counter % 100000 == 0) {
                    System.out.print("*");
                }

                // get country; documents without an inCountry element map to the key "null"
                String country = "null";
                Matcher matcher = countryPattern.matcher(text);
                if (matcher.find()) {
                    country = matcher.group(1);
                }

                text = text.replace("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?><rdf:RDF",
                        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?><rdf:RDF");
                if (started.get(country) == null) {
                    throw new Exception("Unknow country " + country);
                }
                if (started.get(country).booleanValue()) {
                    // remove RDF opening: only the first document of a country keeps its header
                    text = text.substring(text.indexOf("<rdf:RDF "));
                    text = text.substring(text.indexOf(">") + 1);
                }
                // remove RDF ending; it is appended once per file after the whole dump is read
                text = text.substring(0, text.indexOf("</rdf:RDF>"));
                files.get(country).append(text + "\n");
                started.put(country, true);
            }
        } finally {
            LineIterator.closeQuietly(it);
        }

        // terminate every file that received at least one document
        for (Object string : countryToContinent.keySet()) {
            if (started.get(string.toString()).booleanValue()) {
                BufferedWriter bf = files.get(string.toString());
                bf.append("</rdf:RDF>");
                bf.flush();
                bf.close();
            }
        }
    } finally {
        // Writers of countries without data, and all writers on failure, were previously leaked.
        // Closing an already closed writer has no effect, so it is safe to sweep over all of them.
        for (BufferedWriter writer : files.values()) {
            try {
                writer.close();
            } catch (java.io.IOException ignored) {
                // best-effort cleanup; write errors on the success path surface via flush()/close() above
            }
        }
    }
}

From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.DataFetcher.java

/**
 * Downloads the pages listed in the classpath resource {@code roomfordebate-urls.txt} and
 * extracts article and comments from every crawled page into text files.
 *
 * <p>Lines of the URL list that are blank or start with {@code #} are ignored. For the n-th
 * crawled file (sorted by file name) the article is saved as {@code Cx<nnn>.txt} and the comments
 * as {@code Dx<nnn>.txt} in the output folder. An {@link IOException} while processing a single
 * page is reported on stderr and does not stop the remaining pages.
 *
 * @param args {@code args[0]} folder for crawled pages, {@code args[1]} output folder; both are
 *             created if missing
 * @throws Exception if the URL list is not on the classpath or crawling/reading fails
 */
public static void main(String[] args) throws Exception {
    File crawledPagesFolder = new File(args[0]);
    if (!crawledPagesFolder.exists()) {
        crawledPagesFolder.mkdirs();
    }

    File outputFolder = new File(args[1]);
    if (!outputFolder.exists()) {
        outputFolder.mkdirs();
    }

    // read links from text file
    final String urlsResourceName = "roomfordebate-urls.txt";

    // read list of urls; try-with-resources closes the resource stream (a null resource is skipped)
    List<String> urls = new ArrayList<>();
    try (InputStream urlsStream = DataFetcher.class.getClassLoader().getResourceAsStream(urlsResourceName)) {
        if (urlsStream == null) {
            throw new IOException("Cannot find resource " + urlsResourceName + " on the classpath");
        }

        LineIterator iterator = IOUtils.lineIterator(urlsStream, "utf-8");
        while (iterator.hasNext()) {
            // trim first so that an indented "#" still marks a commented-out url
            String line = iterator.nextLine().trim();
            if (!line.isEmpty() && !line.startsWith("#")) {
                urls.add(line);
            }
        }
    }

    // download all
    crawlPages(urls, crawledPagesFolder);

    List<File> files = new ArrayList<>(FileUtils.listFiles(crawledPagesFolder, null, false));
    // sort by name so that the generated ids are stable between runs
    Collections.sort(files, new Comparator<File>() {
        @Override
        public int compare(File o1, File o2) {
            return o1.getName().compareTo(o2.getName());
        }
    });

    int idCounter = 0;

    for (File file : files) {
        NYTimesCommentsScraper commentsScraper = new NYTimesCommentsScraper();
        NYTimesArticleExtractor extractor = new NYTimesArticleExtractor();

        String html = FileUtils.readFileToString(file, "utf-8");

        idCounter++;
        File outputFileArticle = new File(outputFolder, String.format("Cx%03d.txt", idCounter));
        File outputFileComments = new File(outputFolder, String.format("Dx%03d.txt", idCounter));

        try {
            List<Comment> comments = commentsScraper.extractComments(html);
            Article article = extractor.extractArticle(html);

            saveArticleToText(article, outputFileArticle);
            System.out.println("Saved to " + outputFileArticle);

            saveCommentsToText(comments, outputFileComments, article);
            System.out.println("Saved to " + outputFileComments);
        } catch (IOException ex) {
            System.err.println(file.getName() + "\n" + ex.getMessage());
        }
    }
}

From source file:data_gen.Data_gen.java

/**
 * Entry point of the data generator.
 *
 * <p>Validates the two command-line arguments, reads the lines of the first {@code .txt} file of
 * the text source folder into {@code tx}, creates the output directory and then builds the data
 * set as JSON or {@code .dat} depending on the suffix of {@code Default_DataSet_name}, followed by
 * {@code generate_xml()} and {@code generate_field_map()}.
 *
 * <p>On any validation failure a message is printed and the JVM exits with status 0.
 *
 * @param args {@code args[0]} text source folder, {@code args[1]} path of a {@code .properties}
 *             configuration file
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    long startTime = System.nanoTime();
    if (args.length < 2) {
        System.out.println("Usage:");
        System.out.println(
                "java -jar \"jarfile\" [Directory of text source folder] [Dierctory of configration file]"
                        + "\n");
        System.exit(0);
    }

    String Dir = args[0]; // get text source dir from user
    String config_dir = args[1];
    File folder = new File(Dir);
    if (folder.isDirectory() == false) {
        System.out.println("Text souce folder is not a Directory." + "\n");
        System.exit(0);
    }
    if (!config_dir.endsWith(".properties") && !config_dir.endsWith(".PROPERTIES")) {
        System.out.println("\n"
                + "There was error parsing dataset parameters from configuration file, make sure you have the 4 parameters specified and the right type of file"
                + "\n");
        System.exit(0);
    }

    listOfFiles = folder.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.toLowerCase().endsWith(".txt");
        }
    });

    // File.listFiles returns null on an I/O error; treat that like an empty folder instead of
    // failing with a NullPointerException.
    if (listOfFiles == null || listOfFiles.length == 0) {
        System.out.println("Text source folder is empty ! Have at least one .txt file there" + "\n");
        System.exit(0);
    }

    System.out.println("\n");
    Parse_Document_values(config_dir);// parse config file to get class attribute values
    document_size = Docments_Total_size / documents_count; // to get each document size
    // individual documents may vary between 20% and 180% of the average size
    max = (long) ((double) document_size * 1.8);
    min = (long) ((double) document_size * 0.2);

    schema_fields = Parse_Document_fields(config_dir);

    LineIterator it = null;
    try {
        it = FileUtils.lineIterator(listOfFiles[0]);

        while (it.hasNext()) {
            tx.add(it.nextLine());
        }
    } catch (NullPointerException | FileNotFoundException e) {
        System.out.println("The text source file could not be found." + "\n");
        System.exit(0);
    } finally {
        // release the file handle of the text source once all lines are buffered
        if (it != null) {
            LineIterator.closeQuietly(it);
        }
    }

    new File(output_dir).mkdir();
    // build json or .dat, selected by the suffix of the configured data set name
    if (Default_DataSet_name.endsWith(".json")) {
        Build_json_file(config_dir, startTime);
    }

    if (Default_DataSet_name.endsWith(".dat")) {
        Build_dat_file(config_dir, startTime);
    }

    generate_xml();
    generate_field_map();

}

From source file:com.cirro.jsonjoin.utils.FileManager.java

/**
 * Reads a UTF-8 text file and converts every line to a row via {@code convertToRow}.
 *
 * @param file      file to read, one row per line
 * @param valueType row class handed to {@code convertToRow} for every line
 * @param <T>       row type
 * @return the converted rows in file order
 * @throws IOException if the file cannot be opened
 */
public static <T extends Row> List<T> loadFile(File file, Class<T> valueType) throws IOException {
    List<T> rowList = new ArrayList<T>();
    LineIterator it = FileUtils.lineIterator(file, "UTF-8");
    try {
        while (it.hasNext()) {
            // convertToRow is asked for valueType, so the row is treated as T; the cast is
            // unchecked and erases to Row, exactly what the previous raw list accepted.
            @SuppressWarnings("unchecked")
            T row = (T) convertToRow(it.nextLine(), valueType);
            rowList.add(row);
        }
    } finally {
        // the iterator holds an open file handle; release it even if conversion fails
        LineIterator.closeQuietly(it);
    }
    return rowList;
}

From source file:com.cirro.jsonjoin.utils.FileManager.java

/**
 * Reads a UTF-8 text file, converts every line to a row via {@code convertToRow} and returns the
 * rows as a stream.
 *
 * <p>The file is read completely and closed before the stream is returned, so the returned
 * stream is backed by an in-memory list and does not need to be closed.
 *
 * @param file      file to read, one row per line
 * @param valueType row class handed to {@code convertToRow} for every line
 * @param <T>       row type
 * @return a stream over the converted rows in file order
 * @throws IOException if the file cannot be opened
 */
public static <T extends Row> Stream<T> loadFileStream(File file, Class<T> valueType) throws IOException {
    List<T> rowList = new ArrayList<T>();
    LineIterator it = FileUtils.lineIterator(file, "UTF-8");
    try {
        while (it.hasNext()) {
            // convertToRow is asked for valueType, so the row is treated as T; the cast is
            // unchecked and erases to Row, exactly what the previous raw list accepted.
            @SuppressWarnings("unchecked")
            T row = (T) convertToRow(it.nextLine(), valueType);
            rowList.add(row);
        }
    } finally {
        // the iterator holds an open file handle; release it even if conversion fails
        LineIterator.closeQuietly(it);
    }
    return rowList.stream();
}

From source file:au.org.ala.names.util.FileUtils.java

/**
 * Adds the trimmed lines of {@code source} to {@code resultSet}.
 *
 * <p>Each line is trimmed, optionally lower-cased, and added unless {@code ignore} rejects it.
 * The stream is read via {@code getLineIterator(source, "UTF8")} and is not closed here.
 *
 * @param source      stream to read lines from
 * @param resultSet   set receiving the accepted lines; also the return value
 * @param toLowerCase whether lines are lower-cased before the {@code ignore} check
 * @return {@code resultSet}
 */
public static Set<String> streamToSet(InputStream source, Set<String> resultSet, boolean toLowerCase)
        throws IOException {
    for (LineIterator it = getLineIterator(source, "UTF8"); it.hasNext();) {
        final String trimmed = it.nextLine().trim();
        final String entry = toLowerCase ? trimmed.toLowerCase() : trimmed;
        // skip comments
        if (ignore(entry)) {
            continue;
        }
        resultSet.add(entry);
    }
    return resultSet;
}

From source file:net.femtoparsec.jwhois.JWhoIsTest.java

/**
 * Prints the given bytes to standard output, one text line per output line.
 *
 * <p>The bytes are decoded with the reader's default charset.
 *
 * @param bytes raw bytes to print as text
 */
private static void dumpAsText(byte[] bytes) {
    final InputStreamReader decoded = new InputStreamReader(new ByteArrayInputStream(bytes));
    for (LineIterator lines = IOUtils.lineIterator(decoded); lines.hasNext();) {
        System.out.println(lines.nextLine());
    }
}

From source file:de.tudarmstadt.lt.nlkg.EvaluatePreds.java

/**
 * Evaluates predicate entailment predictions against a tab-separated gold file and prints
 * precision, recall and F1.
 *
 * <p>The first line of the file is skipped as a header and blank lines are ignored. Every other
 * line is split on tabs into {@code x}, {@code y}, left predicate, right predicate and the true
 * entailment class; these values are appended to {@code _X}, {@code _Y}, {@code _PRED_L},
 * {@code _PRED_R} and {@code _ENTAILING}. The prediction of {@code predictEntailing} is compared
 * with the true class and logged through {@code Evaluate.log_true} / {@code Evaluate.log_false}.
 *
 * @param file path of the tab-separated evaluation file
 * @throws FileNotFoundException if {@code file} cannot be opened
 */
static void evaluate(String file) throws IllegalArgumentException, FileNotFoundException {
    DT dt = new DT() {
        {
            _mysql_dbname = "nlkg_1";
        }
    };
    try {
        // counters are doubles so that the ratios below use floating point division
        double tp = 0d, tn = 0d, fp = 0d, fn = 0d;
        LineIterator iter = new LineIterator(new FileReader(file));
        try {
            if (iter.hasNext()) {
                iter.nextLine(); // skip first line
            }
            int lineno = 1;
            while (iter.hasNext()) {
                lineno++;
                String line = iter.nextLine();
                if (line.trim().isEmpty()) {
                    continue;
                }

                String[] splits = line.split("\t");
                String x = splits[0].trim();
                String y = splits[1].trim();
                String pred_l = splits[2].trim();
                String pred_r = splits[3].trim();
                boolean entailing_trueclass = Boolean.valueOf(splits[4].trim());

                _X.add(x);
                _Y.add(y);
                _PRED_L.add(pred_l);
                _PRED_R.add(pred_r);
                _ENTAILING.add(entailing_trueclass);

                boolean entailing_predicted = predictEntailing(dt, pred_l, pred_r);
                if (lineno % 100 == 0) {
                    Evaluate.log_progress();
                }

                // exactly one cell of the confusion matrix applies per line
                if (entailing_predicted && entailing_trueclass) {
                    Evaluate.log_true(String.format("%d %-10s %-30s %-30s %b %n", lineno, "tp", pred_l, pred_r,
                            entailing_trueclass));
                    tp++;
                } else if (!entailing_predicted && !entailing_trueclass) {
                    Evaluate.log_true(String.format("%d %-10s %-30s %-30s %b %n", lineno, "tn", pred_l, pred_r,
                            entailing_trueclass));
                    tn++;
                } else if (entailing_predicted) {
                    Evaluate.log_false(String.format("%d %-10s %-30s %-30s %b %n", lineno, "fp", pred_l, pred_r,
                            entailing_trueclass));
                    fp++;
                } else {
                    Evaluate.log_false(String.format("%d %-10s %-30s %-30s %b %n", lineno, "fn", pred_l, pred_r,
                            entailing_trueclass));
                    fn++;
                }
            }
        } finally {
            // closes the underlying FileReader as well
            LineIterator.closeQuietly(iter);
        }

        System.out.format("tp: %d; fp: %d; fn: %d; tn: %d; %n", (int) tp, (int) fp, (int) fn, (int) tn);
        System.out.println("Precision = " + (tp / (tp + fp)));
        System.out.println("Recall    = " + (tp / (tp + fn)));
        System.out.println("F1        = " + ((2 * tp) / ((2 * tp) + fn + fp)));
    } finally {
        // disconnect even when the file is missing or a line is malformed
        dt.disconnect();
    }

}

From source file:de.tudarmstadt.lt.nlkg.EvaluateArgs.java

/**
 * Evaluates argument entailment predictions against a tab-separated gold file and prints
 * precision, recall and F1.
 *
 * <p>The first line of the file is skipped as a header and blank lines are ignored. Every other
 * line is split on tabs into left argument, right argument and the true entailment class; these
 * values are appended to {@code _ARG_L}, {@code _ARG_R} and {@code _ENTAILING}. The prediction of
 * {@code predictEntailing} is compared with the true class and logged through
 * {@code Evaluate.log_true} / {@code Evaluate.log_false}.
 *
 * @param file path of the tab-separated evaluation file
 * @throws FileNotFoundException if {@code file} cannot be opened
 */
static void evaluate(String file) throws IllegalArgumentException, FileNotFoundException {
    DT dt = new DT() {
        {
            _mysql_host = "localhost";
            _mysql_dbname = "nlkg_1";
        }
    };
    // counters are doubles so that the ratios below use floating point division
    double tp = 0d, tn = 0d, fp = 0d, fn = 0d;
    LineIterator iter = null;
    try {
        iter = new LineIterator(new FileReader(file));
        if (iter.hasNext()) {
            iter.nextLine(); // skip first line
        }
        int lineno = 1;
        while (iter.hasNext()) {
            lineno++;
            String line = iter.nextLine();
            if (line.trim().isEmpty()) {
                continue;
            }

            String[] splits = line.split("\t");
            String arg_l = splits[0].trim();
            String arg_r = splits[1].trim();
            boolean entailing_trueclass = Boolean.valueOf(splits[2].trim());

            _ARG_L.add(arg_l);
            _ARG_R.add(arg_r);
            _ENTAILING.add(entailing_trueclass);

            boolean entailing_predicted = predictEntailing(dt, arg_l, arg_r);
            if (lineno % 100 == 0) {
                Evaluate.log_progress();
            }

            // exactly one cell of the confusion matrix applies per line
            if (entailing_predicted && entailing_trueclass) {
                Evaluate.log_true(String.format("%d %-10s %-30s %-30s %b %n", lineno, "tp", arg_l, arg_r,
                        entailing_trueclass));
                tp++;
            } else if (!entailing_predicted && !entailing_trueclass) {
                Evaluate.log_true(String.format("%d %-10s %-30s %-30s %b %n", lineno, "tn", arg_l, arg_r,
                        entailing_trueclass));
                tn++;
            } else if (entailing_predicted) {
                Evaluate.log_false(String.format("%d %-10s %-30s %-30s %b %n", lineno, "fp", arg_l, arg_r,
                        entailing_trueclass));
                fp++;
            } else {
                Evaluate.log_false(String.format("%d %-10s %-30s %-30s %b %n", lineno, "fn", arg_l, arg_r,
                        entailing_trueclass));
                fn++;
            }
        }
    } finally {
        // close the reader and disconnect even when the file is missing or a line is malformed
        if (iter != null) {
            LineIterator.closeQuietly(iter);
        }
        dt.disconnect();
    }

    System.out.format("%ntp: %d; fp: %d; fn: %d; tn: %d; %n", (int) tp, (int) fp, (int) fn, (int) tn);
    System.out.println("Precision = " + (tp / (tp + fp)));
    System.out.println("Recall    = " + (tp / (tp + fn)));
    System.out.println("F1        = " + ((2 * tp) / ((2 * tp) + fn + fp)));
}