List of usage examples for org.apache.commons.io LineIterator nextLine
public String nextLine()
Reader
From source file:de.tu.darmstadt.lt.ner.preprocessing.SentenceToCRFWriter.java
/**
 * Reads a tab-separated file, collects the second column of every row and passes the
 * collected values to {@code GermaNERMain.sentenceToCRFFormat}.
 *
 * @param args {@code args[0]} is the input file, read as UTF-8; {@code args[1]} is forwarded
 *             unchanged as the second argument of {@code sentenceToCRFFormat}
 * @throws IOException if the input file cannot be opened
 */
public static void main(String[] args) throws UIMAException, IllegalArgumentException, IOException {
    List<String> sentences = new ArrayList<String>();
    LineIterator sentIt = FileUtils.lineIterator(new File(args[0]), "UTF-8");
    try {
        while (sentIt.hasNext()) {
            String[] columns = sentIt.nextLine().trim().split("\t");
            // A row without a second column (for example a blank line) used to abort the whole
            // run with an ArrayIndexOutOfBoundsException before the empty check could apply.
            if (columns.length < 2) {
                continue;
            }
            String sentence = columns[1];
            if (sentence.equals("")) {
                continue;
            }
            sentences.add(sentence);
        }
    } finally {
        // Release the file handle even when reading fails part-way through.
        LineIterator.closeQuietly(sentIt);
    }
    GermaNERMain.sentenceToCRFFormat(sentences, args[1], "de");
}
From source file:eu.annocultor.converters.geonames.GeonamesDumpToRdf.java
public static void main(String[] args) throws Exception { File root = new File("input_source"); // load country-continent match countryToContinent/*from w w w .j av a 2s .c om*/ .load((new GeonamesDumpToRdf()).getClass().getResourceAsStream("/country-to-continent.properties")); // creating files Map<String, BufferedWriter> files = new HashMap<String, BufferedWriter>(); Map<String, Boolean> started = new HashMap<String, Boolean>(); for (Object string : countryToContinent.keySet()) { String continent = countryToContinent.getProperty(string.toString()); File dir = new File(root, continent); if (!dir.exists()) { dir.mkdir(); } files.put(string.toString(), new BufferedWriter(new OutputStreamWriter( new FileOutputStream(new File(root, continent + "/" + string + ".rdf")), "UTF-8"))); System.out.println(continent + "/" + string + ".rdf"); started.put(string.toString(), false); } System.out.println(started); Pattern countryPattern = Pattern .compile("<inCountry rdf\\:resource\\=\"http\\://www\\.geonames\\.org/countries/\\#(\\w\\w)\"/>"); long counter = 0; LineIterator it = FileUtils.lineIterator(new File(root, "all-geonames-rdf.txt"), "UTF-8"); try { while (it.hasNext()) { String text = it.nextLine(); if (text.startsWith("http://sws.geonames")) continue; // progress counter++; if (counter % 100000 == 0) { System.out.print("*"); } // System.out.println(counter); // get country String country = null; Matcher matcher = countryPattern.matcher(text); if (matcher.find()) { country = matcher.group(1); } // System.out.println(country); if (country == null) country = "null"; text = text.replace("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?><rdf:RDF", "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?><rdf:RDF"); if (started.get(country) == null) throw new Exception("Unknow country " + country); if (started.get(country).booleanValue()) { // remove RDF opening text = text.substring(text.indexOf("<rdf:RDF ")); text = text.substring(text.indexOf(">") + 1); } // 
remove RDF ending text = text.substring(0, text.indexOf("</rdf:RDF>")); files.get(country).append(text + "\n"); if (!started.get(country).booleanValue()) { // System.out.println("Started with country " + country); } started.put(country, true); } } finally { LineIterator.closeQuietly(it); } for (Object string : countryToContinent.keySet()) { boolean hasStarted = started.get(string.toString()).booleanValue(); if (hasStarted) { BufferedWriter bf = files.get(string.toString()); bf.append("</rdf:RDF>"); bf.flush(); bf.close(); } } return; }
From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.DataFetcher.java
public static void main(String[] args) throws Exception { File crawledPagesFolder = new File(args[0]); if (!crawledPagesFolder.exists()) { crawledPagesFolder.mkdirs();//from www. j a va 2 s . c om } File outputFolder = new File(args[1]); if (!outputFolder.exists()) { outputFolder.mkdirs(); } // read links from text file final String urlsResourceName = "roomfordebate-urls.txt"; InputStream urlsStream = DataFetcher.class.getClassLoader().getResourceAsStream(urlsResourceName); if (urlsStream == null) { throw new IOException("Cannot find resource " + urlsResourceName + " on the classpath"); } // read list of urls List<String> urls = new ArrayList<>(); LineIterator iterator = IOUtils.lineIterator(urlsStream, "utf-8"); while (iterator.hasNext()) { // ignore commented url (line starts with #) String line = iterator.nextLine(); if (!line.startsWith("#") && !line.trim().isEmpty()) { urls.add(line.trim()); } } // download all crawlPages(urls, crawledPagesFolder); List<File> files = new ArrayList<>(FileUtils.listFiles(crawledPagesFolder, null, false)); Collections.sort(files, new Comparator<File>() { @Override public int compare(File o1, File o2) { return o1.getName().compareTo(o2.getName()); } }); int idCounter = 0; for (File file : files) { NYTimesCommentsScraper commentsScraper = new NYTimesCommentsScraper(); NYTimesArticleExtractor extractor = new NYTimesArticleExtractor(); String html = FileUtils.readFileToString(file, "utf-8"); idCounter++; File outputFileArticle = new File(outputFolder, String.format("Cx%03d.txt", idCounter)); File outputFileComments = new File(outputFolder, String.format("Dx%03d.txt", idCounter)); try { List<Comment> comments = commentsScraper.extractComments(html); Article article = extractor.extractArticle(html); saveArticleToText(article, outputFileArticle); System.out.println("Saved to " + outputFileArticle); saveCommentsToText(comments, outputFileComments, article); System.out.println("Saved to " + outputFileComments); } catch (IOException ex) { 
System.err.println(file.getName() + "\n" + ex.getMessage()); } } }
From source file:data_gen.Data_gen.java
public static void main(String[] args) throws FileNotFoundException, IOException { long startTime = System.nanoTime(); if (args.length < 2) { System.out.println("Usage:"); System.out.println(//from ww w . j ava 2 s . com "java -jar \"jarfile\" [Directory of text source folder] [Dierctory of configration file]" + "\n"); System.exit(0); } String Dir = args[0]; // get text source dir from user String config_dir = args[1]; File folder = new File(Dir); if (folder.isDirectory() == false) { System.out.println("Text souce folder is not a Directory." + "\n"); System.exit(0); } if (!config_dir.endsWith(".properties") && !config_dir.endsWith(".PROPERTIES")) { System.out.println("\n" + "There was error parsing dataset parameters from configuration file, make sure you have the 4 parameters specified and the right type of file" + "\n"); System.exit(0); } listOfFiles = folder.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.toLowerCase().endsWith(".txt"); } }); if (listOfFiles.length == 0) { System.out.println("Text source folder is empty ! Have at least one .txt file there" + "\n"); System.exit(0); } System.out.println("\n"); Parse_Document_values(config_dir);// parse config file to get class attribute values document_size = Docments_Total_size / documents_count; // to get each document size max = (long) ((double) document_size * 1.8); min = (long) ((double) document_size * 0.2); schema_fields = Parse_Document_fields(config_dir); try { LineIterator it = FileUtils.lineIterator(listOfFiles[0]); while (it.hasNext()) { tx.add(it.nextLine()); } } catch (NullPointerException | FileNotFoundException e) { System.out.println("The text source file could not be found." 
+ "\n"); System.exit(0); } new File(output_dir).mkdir(); //////////////////////////////////////////////////////////////// build json or .dat //////////////////////////////////////////////////////////////////// if (Default_DataSet_name.endsWith(".json")) { Build_json_file(config_dir, startTime); } if (Default_DataSet_name.endsWith(".dat")) { Build_dat_file(config_dir, startTime); } generate_xml(); generate_field_map(); }
From source file:com.cirro.jsonjoin.utils.FileManager.java
/**
 * Reads {@code file} line by line as UTF-8 and converts each line with
 * {@code convertToRow(line, valueType)}.
 *
 * @param file      the file to read
 * @param valueType the row class passed to {@code convertToRow} for every line
 * @return the converted rows in file order
 * @throws IOException if the file cannot be opened
 */
public static <T extends Row> List<T> loadFile(File file, Class<T> valueType) throws IOException {
    // Kept as a raw list, as in the original: the declared return type of convertToRow is
    // not visible here, so the element type is not tightened.
    List rowList = new ArrayList();
    LineIterator it = FileUtils.lineIterator(file, "UTF-8");
    try {
        while (it.hasNext()) {
            String line = it.nextLine();
            Row row = convertToRow(line, valueType);
            rowList.add(row);
        }
    } finally {
        // The iterator holds an open file handle that was previously never released.
        LineIterator.closeQuietly(it);
    }
    return rowList;
}
From source file:com.cirro.jsonjoin.utils.FileManager.java
/**
 * Reads {@code file} line by line as UTF-8, converts each line with
 * {@code convertToRow(line, valueType)} and returns the rows as a stream.
 *
 * <p>The whole file is read into memory before the stream is created, so the returned
 * stream does not hold the file open.
 *
 * @param file      the file to read
 * @param valueType the row class passed to {@code convertToRow} for every line
 * @return a stream over the converted rows in file order
 * @throws IOException if the file cannot be opened
 */
public static <T extends Row> Stream<T> loadFileStream(File file, Class<T> valueType) throws IOException {
    // Kept as a raw list, as in the original: the declared return type of convertToRow is
    // not visible here, so the element type is not tightened.
    List rowList = new ArrayList();
    LineIterator it = FileUtils.lineIterator(file, "UTF-8");
    try {
        while (it.hasNext()) {
            String line = it.nextLine();
            Row row = convertToRow(line, valueType);
            rowList.add(row);
        }
    } finally {
        // The iterator holds an open file handle that was previously never released.
        LineIterator.closeQuietly(it);
    }
    return rowList.stream();
}
From source file:au.org.ala.names.util.FileUtils.java
public static Set<String> streamToSet(InputStream source, Set<String> resultSet, boolean toLowerCase) throws IOException { LineIterator lines = getLineIterator(source, "UTF8"); while (lines.hasNext()) { String line = lines.nextLine().trim(); if (toLowerCase) line = line.toLowerCase();// www. j a va 2s . c o m // ignore comments if (!ignore(line)) { resultSet.add(line); } } return resultSet; }
From source file:net.femtoparsec.jwhois.JWhoIsTest.java
/**
 * Prints the given bytes to standard output, one text line per output line.
 *
 * <p>The bytes are decoded by an {@code InputStreamReader} created without an explicit
 * charset.
 *
 * @param bytes the raw content to print
 */
private static void dumpAsText(byte[] bytes) {
    final InputStreamReader decoded = new InputStreamReader(new ByteArrayInputStream(bytes));
    for (LineIterator lines = IOUtils.lineIterator(decoded); lines.hasNext();) {
        System.out.println(lines.nextLine());
    }
}
From source file:de.tudarmstadt.lt.nlkg.EvaluatePreds.java
static void evaluate(String file) throws IllegalArgumentException, FileNotFoundException { DT dt = new DT() { {/*from ww w . ja v a 2 s.c o m*/ _mysql_dbname = "nlkg_1"; } }; LineIterator iter = new LineIterator(new FileReader(file)); iter.nextLine(); // skip first line int lineno = 1; double tp = 0d, tn = 0d, fp = 0d, fn = 0d; while (iter.hasNext() && (lineno < 100 || true)) { lineno++; String line = iter.nextLine(); if (line.trim().isEmpty()) continue; String[] splits = line.split("\t"); String x = splits[0].trim(); String y = splits[1].trim(); String pred_l = splits[2].trim(); String pred_r = splits[3].trim(); boolean entailing_trueclass = Boolean.valueOf(splits[4].trim()); _X.add(x); _Y.add(y); _PRED_L.add(pred_l); _PRED_R.add(pred_r); _ENTAILING.add(entailing_trueclass); boolean entailing_predicted = predictEntailing(dt, pred_l, pred_r); if (lineno % 100 == 0) Evaluate.log_progress(); if (entailing_predicted && entailing_trueclass) { Evaluate.log_true(String.format("%d %-10s %-30s %-30s %b %n", lineno, "tp", pred_l, pred_r, entailing_trueclass)); tp++; } if (!entailing_predicted && !entailing_trueclass) { Evaluate.log_true(String.format("%d %-10s %-30s %-30s %b %n", lineno, "tn", pred_l, pred_r, entailing_trueclass)); tn++; } if (entailing_predicted && !entailing_trueclass) { Evaluate.log_false(String.format("%d %-10s %-30s %-30s %b %n", lineno, "fp", pred_l, pred_r, entailing_trueclass)); fp++; } if (!entailing_predicted && entailing_trueclass) { Evaluate.log_false(String.format("%d %-10s %-30s %-30s %b %n", lineno, "fn", pred_l, pred_r, entailing_trueclass)); fn++; } } System.out.format("tp: %d; fp: %d; fn: %d; tn: %d; %n", (int) tp, (int) fp, (int) fn, (int) tn); System.out.println("Precision = " + (tp / (tp + fp))); System.out.println("Recall = " + (tp / (tp + fn))); System.out.println("F1 = " + ((2 * tp) / ((2 * tp) + fn + fp))); dt.disconnect(); }
From source file:de.tudarmstadt.lt.nlkg.EvaluateArgs.java
static void evaluate(String file) throws IllegalArgumentException, FileNotFoundException { DT dt = new DT() { {// w w w .ja v a 2s . c o m _mysql_host = "localhost"; _mysql_dbname = "nlkg_1"; } }; LineIterator iter = new LineIterator(new FileReader(file)); iter.nextLine(); // skip first line int lineno = 1; double tp = 0d, tn = 0d, fp = 0d, fn = 0d; while (iter.hasNext() && (lineno < 100 || true)) { lineno++; String line = iter.nextLine(); if (line.trim().isEmpty()) continue; String[] splits = line.split("\t"); // String context = splits[0].trim(); String arg_l = splits[0].trim(); String arg_r = splits[1].trim(); boolean entailing_trueclass = Boolean.valueOf(splits[2].trim()); // _CONTEXT.add(context); _ARG_L.add(arg_l); _ARG_R.add(arg_r); _ENTAILING.add(entailing_trueclass); boolean entailing_predicted = predictEntailing(dt, arg_l, arg_r); if (lineno % 100 == 0) Evaluate.log_progress(); if (entailing_predicted && entailing_trueclass) { Evaluate.log_true(String.format("%d %-10s %-30s %-30s %b %n", lineno, "tp", arg_l, arg_r, entailing_trueclass)); tp++; } if (!entailing_predicted && !entailing_trueclass) { Evaluate.log_true(String.format("%d %-10s %-30s %-30s %b %n", lineno, "tn", arg_l, arg_r, entailing_trueclass)); tn++; } if (entailing_predicted && !entailing_trueclass) { Evaluate.log_false(String.format("%d %-10s %-30s %-30s %b %n", lineno, "fp", arg_l, arg_r, entailing_trueclass)); fp++; } if (!entailing_predicted && entailing_trueclass) { Evaluate.log_false(String.format("%d %-10s %-30s %-30s %b %n", lineno, "fn", arg_l, arg_r, entailing_trueclass)); fn++; } } dt.disconnect(); System.out.format("%ntp: %d; fp: %d; fn: %d; tn: %d; %n", (int) tp, (int) fp, (int) fn, (int) tn); System.out.println("Precision = " + (tp / (tp + fp))); System.out.println("Recall = " + (tp / (tp + fn))); System.out.println("F1 = " + ((2 * tp) / ((2 * tp) + fn + fp))); }