List of usage examples for org.apache.commons.io LineIterator hasNext
public boolean hasNext()
Indicates whether the Reader has more lines.
From source file:de.tu.darmstadt.lt.ner.preprocessing.SentenceToCRFWriter.java
public static void main(String[] args) throws UIMAException, IllegalArgumentException, IOException { LineIterator sentIt = FileUtils.lineIterator(new File(args[0]), "UTF-8"); List<String> sentences = new ArrayList<String>(); StringBuilder sb = new StringBuilder(); int index = 0; while (sentIt.hasNext()) { String line = sentIt.nextLine().toString().trim().split("\t")[1]; if (line.equals("")) { continue; }// www .ja v a 2s .c om sentences.add(line); } GermaNERMain.sentenceToCRFFormat(sentences, args[1], "de"); }
From source file:com.senseidb.search.node.inmemory.InMemoryIndexPerfEval.java
public static void main(String[] args) throws Exception { final InMemorySenseiService memorySenseiService = InMemorySenseiService.valueOf( new File(InMemoryIndexPerfEval.class.getClassLoader().getResource("test-conf/node1/").toURI())); final List<JSONObject> docs = new ArrayList<JSONObject>(15000); LineIterator lineIterator = FileUtils.lineIterator( new File(InMemoryIndexPerfEval.class.getClassLoader().getResource("data/test_data.json").toURI())); int i = 0;/*from w w w . j a v a 2 s . co m*/ while (lineIterator.hasNext() && i < 100) { String car = lineIterator.next(); if (car != null && car.contains("{")) docs.add(new JSONObject(car)); i++; } Thread[] threads = new Thread[10]; for (int k = 0; k < threads.length; k++) { threads[k] = new Thread(new Runnable() { public void run() { long time = System.currentTimeMillis(); //System.out.println("Start thread"); for (int j = 0; j < 1000; j++) { //System.out.println("Send request"); memorySenseiService.doQuery(getRequest(), docs); } System.out.println("time = " + (System.currentTimeMillis() - time)); } }); threads[k].start(); } Thread.sleep(500000); }
From source file:eu.annocultor.converters.geonames.GeonamesDumpToRdf.java
public static void main(String[] args) throws Exception { File root = new File("input_source"); // load country-continent match countryToContinent/*from ww w. j a v a 2 s . c om*/ .load((new GeonamesDumpToRdf()).getClass().getResourceAsStream("/country-to-continent.properties")); // creating files Map<String, BufferedWriter> files = new HashMap<String, BufferedWriter>(); Map<String, Boolean> started = new HashMap<String, Boolean>(); for (Object string : countryToContinent.keySet()) { String continent = countryToContinent.getProperty(string.toString()); File dir = new File(root, continent); if (!dir.exists()) { dir.mkdir(); } files.put(string.toString(), new BufferedWriter(new OutputStreamWriter( new FileOutputStream(new File(root, continent + "/" + string + ".rdf")), "UTF-8"))); System.out.println(continent + "/" + string + ".rdf"); started.put(string.toString(), false); } System.out.println(started); Pattern countryPattern = Pattern .compile("<inCountry rdf\\:resource\\=\"http\\://www\\.geonames\\.org/countries/\\#(\\w\\w)\"/>"); long counter = 0; LineIterator it = FileUtils.lineIterator(new File(root, "all-geonames-rdf.txt"), "UTF-8"); try { while (it.hasNext()) { String text = it.nextLine(); if (text.startsWith("http://sws.geonames")) continue; // progress counter++; if (counter % 100000 == 0) { System.out.print("*"); } // System.out.println(counter); // get country String country = null; Matcher matcher = countryPattern.matcher(text); if (matcher.find()) { country = matcher.group(1); } // System.out.println(country); if (country == null) country = "null"; text = text.replace("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?><rdf:RDF", "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?><rdf:RDF"); if (started.get(country) == null) throw new Exception("Unknow country " + country); if (started.get(country).booleanValue()) { // remove RDF opening text = text.substring(text.indexOf("<rdf:RDF ")); text = text.substring(text.indexOf(">") + 1); } 
// remove RDF ending text = text.substring(0, text.indexOf("</rdf:RDF>")); files.get(country).append(text + "\n"); if (!started.get(country).booleanValue()) { // System.out.println("Started with country " + country); } started.put(country, true); } } finally { LineIterator.closeQuietly(it); } for (Object string : countryToContinent.keySet()) { boolean hasStarted = started.get(string.toString()).booleanValue(); if (hasStarted) { BufferedWriter bf = files.get(string.toString()); bf.append("</rdf:RDF>"); bf.flush(); bf.close(); } } return; }
From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.DataFetcher.java
public static void main(String[] args) throws Exception { File crawledPagesFolder = new File(args[0]); if (!crawledPagesFolder.exists()) { crawledPagesFolder.mkdirs();/* w w w . java 2s. c o m*/ } File outputFolder = new File(args[1]); if (!outputFolder.exists()) { outputFolder.mkdirs(); } // read links from text file final String urlsResourceName = "roomfordebate-urls.txt"; InputStream urlsStream = DataFetcher.class.getClassLoader().getResourceAsStream(urlsResourceName); if (urlsStream == null) { throw new IOException("Cannot find resource " + urlsResourceName + " on the classpath"); } // read list of urls List<String> urls = new ArrayList<>(); LineIterator iterator = IOUtils.lineIterator(urlsStream, "utf-8"); while (iterator.hasNext()) { // ignore commented url (line starts with #) String line = iterator.nextLine(); if (!line.startsWith("#") && !line.trim().isEmpty()) { urls.add(line.trim()); } } // download all crawlPages(urls, crawledPagesFolder); List<File> files = new ArrayList<>(FileUtils.listFiles(crawledPagesFolder, null, false)); Collections.sort(files, new Comparator<File>() { @Override public int compare(File o1, File o2) { return o1.getName().compareTo(o2.getName()); } }); int idCounter = 0; for (File file : files) { NYTimesCommentsScraper commentsScraper = new NYTimesCommentsScraper(); NYTimesArticleExtractor extractor = new NYTimesArticleExtractor(); String html = FileUtils.readFileToString(file, "utf-8"); idCounter++; File outputFileArticle = new File(outputFolder, String.format("Cx%03d.txt", idCounter)); File outputFileComments = new File(outputFolder, String.format("Dx%03d.txt", idCounter)); try { List<Comment> comments = commentsScraper.extractComments(html); Article article = extractor.extractArticle(html); saveArticleToText(article, outputFileArticle); System.out.println("Saved to " + outputFileArticle); saveCommentsToText(comments, outputFileComments, article); System.out.println("Saved to " + outputFileComments); } catch (IOException ex) { 
System.err.println(file.getName() + "\n" + ex.getMessage()); } } }
From source file:de.tum.i13.ConvertCsvToProtobuf.java
public static void main(String args[]) { try {//from w w w . j a v a2s . co m LineIterator it = FileUtils.lineIterator(new File("/Users/manit/Projects/sdcbenchmark/Dataset/debscsv"), "UTF-8"); FileOutputStream out = new FileOutputStream("/Users/manit/Projects/sdcbenchmark/Dataset/debsprotobuf", true); while (it.hasNext()) { String csvLine = (String) it.next(); byte[] csvLineBytes = csvLine.getBytes(); String line = new String(csvLineBytes, StandardCharsets.UTF_8); Debs2015Protos.Taxitrip.Builder builder = Debs2015Protos.Taxitrip.newBuilder(); String[] splitted = line.split(","); builder.setMedallion(splitted[0]); builder.setHackLicense(splitted[1]); builder.setPickupDatetime(splitted[2]); builder.setDropoffDatetime(splitted[3]); builder.setTripTimeInSecs(Integer.parseInt(splitted[4])); builder.setTripDistance(Float.parseFloat(splitted[5])); builder.setPickupLongitude(Float.parseFloat(splitted[6])); builder.setPickupLatitude(Float.parseFloat(splitted[7])); builder.setDropoffLongitude(Float.parseFloat(splitted[8])); builder.setDropoffLatitude(Float.parseFloat(splitted[9])); builder.setPaymentType(splitted[10]); builder.setFareAmount(Float.parseFloat(splitted[11])); builder.setSurcharge(Float.parseFloat(splitted[12])); builder.setMtaTax(Float.parseFloat(splitted[13])); builder.setTipAmount(Float.parseFloat(splitted[14])); builder.setTollsAmount(Float.parseFloat(splitted[15])); builder.setTotalAmount(Float.parseFloat(splitted[16])); builder.build().writeDelimitedTo(out); } out.close(); } catch (Exception e) { e.printStackTrace(); } }
From source file:data_gen.Data_gen.java
public static void main(String[] args) throws FileNotFoundException, IOException { long startTime = System.nanoTime(); if (args.length < 2) { System.out.println("Usage:"); System.out.println(// www .j av a 2 s. co m "java -jar \"jarfile\" [Directory of text source folder] [Dierctory of configration file]" + "\n"); System.exit(0); } String Dir = args[0]; // get text source dir from user String config_dir = args[1]; File folder = new File(Dir); if (folder.isDirectory() == false) { System.out.println("Text souce folder is not a Directory." + "\n"); System.exit(0); } if (!config_dir.endsWith(".properties") && !config_dir.endsWith(".PROPERTIES")) { System.out.println("\n" + "There was error parsing dataset parameters from configuration file, make sure you have the 4 parameters specified and the right type of file" + "\n"); System.exit(0); } listOfFiles = folder.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.toLowerCase().endsWith(".txt"); } }); if (listOfFiles.length == 0) { System.out.println("Text source folder is empty ! Have at least one .txt file there" + "\n"); System.exit(0); } System.out.println("\n"); Parse_Document_values(config_dir);// parse config file to get class attribute values document_size = Docments_Total_size / documents_count; // to get each document size max = (long) ((double) document_size * 1.8); min = (long) ((double) document_size * 0.2); schema_fields = Parse_Document_fields(config_dir); try { LineIterator it = FileUtils.lineIterator(listOfFiles[0]); while (it.hasNext()) { tx.add(it.nextLine()); } } catch (NullPointerException | FileNotFoundException e) { System.out.println("The text source file could not be found." 
+ "\n"); System.exit(0); } new File(output_dir).mkdir(); //////////////////////////////////////////////////////////////// build json or .dat //////////////////////////////////////////////////////////////////// if (Default_DataSet_name.endsWith(".json")) { Build_json_file(config_dir, startTime); } if (Default_DataSet_name.endsWith(".dat")) { Build_dat_file(config_dir, startTime); } generate_xml(); generate_field_map(); }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.StatisticsTableCreator.java
public static Table<String, String, Long> loadTable(InputStream stream) throws IOException { Table<String, String, Long> result = TreeBasedTable.create(); LineIterator lineIterator = IOUtils.lineIterator(stream, "utf-8"); while (lineIterator.hasNext()) { String line = lineIterator.next(); System.out.println(line); String[] split = line.split("\t"); String language = split[0]; String license = split[1]; Long documents = Long.valueOf(split[2]); Long tokens = Long.valueOf(split[3]); result.put(language, "docs " + license, documents); result.put(language, "tokens " + license, tokens); }// www. ja v a 2 s . com return result; }
From source file:com.cirro.jsonjoin.utils.FileManager.java
/**
 * Reads a UTF-8 text file and converts every line into a row via {@code convertToRow}.
 *
 * @param file      the file to read, one row per line
 * @param valueType the row class passed to {@code convertToRow} for each line
 * @param <T>       the row type
 * @return the converted rows in file order
 * @throws IOException if the file cannot be opened
 */
public static <T extends Row> List<T> loadFile(File file, Class<T> valueType) throws IOException {
    List<T> rowList = new ArrayList<>();
    LineIterator it = FileUtils.lineIterator(file, "UTF-8");
    try {
        while (it.hasNext()) {
            // Same erased behaviour as the former raw list, but type-checked for callers.
            @SuppressWarnings("unchecked")
            T row = (T) convertToRow(it.nextLine(), valueType);
            rowList.add(row);
        }
    } finally {
        // Without this the file handle stays open after every call.
        LineIterator.closeQuietly(it);
    }
    return rowList;
}
From source file:com.cirro.jsonjoin.utils.FileManager.java
/**
 * Reads a UTF-8 text file, converts every line into a row via {@code convertToRow} and
 * returns the rows as a stream.
 *
 * <p>The file is read completely before the stream is returned, so the stream is backed
 * by an in-memory list and does not keep the file open.
 *
 * @param file      the file to read, one row per line
 * @param valueType the row class passed to {@code convertToRow} for each line
 * @param <T>       the row type
 * @return a stream over the converted rows in file order
 * @throws IOException if the file cannot be opened
 */
public static <T extends Row> Stream<T> loadFileStream(File file, Class<T> valueType) throws IOException {
    List<T> rowList = new ArrayList<>();
    LineIterator it = FileUtils.lineIterator(file, "UTF-8");
    try {
        while (it.hasNext()) {
            // Same erased behaviour as the former raw list, but type-checked for callers.
            @SuppressWarnings("unchecked")
            T row = (T) convertToRow(it.nextLine(), valueType);
            rowList.add(row);
        }
    } finally {
        // Without this the file handle stays open after every call.
        LineIterator.closeQuietly(it);
    }
    return rowList.stream();
}
From source file:net.femtoparsec.jwhois.JWhoIsTest.java
private static void dumpAsText(byte[] bytes) { LineIterator iterator = IOUtils.lineIterator(new InputStreamReader(new ByteArrayInputStream(bytes))); while (iterator.hasNext()) { System.out.println(iterator.nextLine()); }// www . j ava2 s . com }