List of usage examples for org.apache.commons.io LineIterator nextLine
public String nextLine()
Reader
From source file:es.ua.dlsi.lexicalinformation.Corpus.java
/** * Method that retrieves all the lines containing a given surface form in the * corpus./* w w w . j av a 2 s.co m*/ * @param word Word to be searched in the corpus * @return Returns the set of lines containing a given surface form in the * corpus. */ public Set<String> GetAllExamples(String word) { Set<String> examples = new LinkedHashSet<String>(); LineIterator corpus_it = null; try { corpus_it = FileUtils.lineIterator(new File(this.path)); } catch (FileNotFoundException ex) { System.err.println("Error while trying to open '" + this.path + "' file."); System.exit(-1); } catch (IOException ex) { System.err.println("Error while reading '" + this.path + "' file."); System.exit(-1); } while (corpus_it.hasNext()) { String line = corpus_it.nextLine(); //If the surface form appears in the sentence... if (line.matches("^" + word + " .*") || line.matches(".* " + word + "$") || line.matches(".* " + word + " .*")) { examples.add(line); } } corpus_it.close(); return examples; }
From source file:eu.annocultor.converters.geonames.GeonamesCsvToRdf.java
/**
 * Converts the records of {@code allCountries.txt} (located under {@code root})
 * into triples, writing them through {@code write(...)}.
 *
 * <p>Steps performed, in order:
 * <ol>
 * <li>loads the country-to-continent mapping from the classpath resource
 * {@code /country-to-continent.properties};</li>
 * <li>calls {@code createDirsForContinents()};</li>
 * <li>reads the input file line by line as UTF-8, splitting each line on tabs;
 * only records whose country maps to a continent starting with
 * {@code continentToConvert} and that pass {@code includeRecordInConversion}
 * are written;</li>
 * <li>calls {@code endRDF()} on every writer found in {@code files} and on
 * {@code allCountries} when it is not null.</li>
 * </ol>
 *
 * @throws Exception if a line does not split into exactly 19 fields, or if any
 * of the called I/O or writer operations fails
 */
void features() throws Exception {
    System.out.println("Parsing features");
    // load country-continent match
    // NOTE(review): the stream returned by getResourceAsStream is not closed in this method.
    countryToContinent.load(
            (new GeonamesCsvToRdf("EU")).getClass().getResourceAsStream("/country-to-continent.properties"));
    createDirsForContinents();
    long counter = 0;
    LineIterator it = FileUtils.lineIterator(new File(root, "allCountries.txt"), "UTF-8");
    try {
        while (it.hasNext()) {
            String text = it.nextLine();
            // NOTE(review): String.split drops trailing empty strings, so a record whose
            // last column(s) are empty yields fewer than 19 fields and is rejected below.
            String[] fields = text.split("\t");
            if (fields.length != 19) {
                throw new Exception("Field names mismatch on " + text);
            }
            // progress: print one '*' per 100000 input lines
            counter++;
            if (counter % 100000 == 0) {
                System.out.print("*");
            }
            String country = fields[countryCode];
            String continent = countryToContinent.getProperty(country);
            // Countries missing from the mapping are skipped.
            if (continent != null && continent.startsWith(continentToConvert)) {
                String id = fields[geonameid];
                String uri = NS_GEONAMES_INSTANCES + id + "/";
                String featureCodeField = fields[featureClass] + "." + fields[featureCode];
                String populationValue = fields[population];
                if (includeRecordInConversion(featureCodeField, populationValue)) {
                    boolean isDescriptionOfCountry = featureCodeField.startsWith("A.PCLI");
                    // preferred label
                    if (!fields[name].isEmpty()) {
                        write(country, new Triple(uri, SKOS.LABEL_PREFERRED, new LiteralValue(fields[name]), null),
                                isDescriptionOfCountry);
                    }
                    // String altLabels[] = fields[alternatenames].split(",");
                    // for (String altLabel : altLabels) {
                    // write(country, new Triple(uri, SKOS.LABEL_ALT, new LiteralValue(altLabel), null));
                    // }
                    // alternative labels come from the altLabels collection; the entry is
                    // removed once it has been written
                    Collection<LiteralValue> altLabelCollection = altLabels.getCollection(id);
                    if (altLabelCollection != null) {
                        for (LiteralValue xmlValue : altLabelCollection) {
                            write(country, new Triple(uri, SKOS.LABEL_ALT, xmlValue, null), isDescriptionOfCountry);
                        }
                        altLabels.remove(id);
                    }
                    Collection<String> linkCollection = links.getCollection(id);
                    if (linkCollection != null) {
                        // The loop body is commented out, so links are currently not written.
                        for (String link : linkCollection) {
                            // write(country, new Triple(uri, new Property(NS_EUROPEANA_SCHEMA + "link"), new LiteralValue(link), null));
                        }
                        // NOTE(review): this removes the id string from the per-id collection of
                        // links, whereas the alt-label branch above calls altLabels.remove(id);
                        // possibly links.remove(id) was intended - confirm.
                        linkCollection.remove(fields[geonameid]);
                    }
                    // Population values that are empty or a single character are not written.
                    if (fields[population].length() > 1) {
                        write(country, new Triple(uri, new Property(NS_EUROPEANA_SCHEMA + "population"),
                                new LiteralValue(fields[population]), null), isDescriptionOfCountry);
                    }
                    if (!fields[longitude].isEmpty()) {
                        write(country, new Triple(uri, new Property(NS_WGS_SCHEMA + "long"),
                                new LiteralValue(fields[longitude]), null), isDescriptionOfCountry);
                    }
                    if (!fields[latitude].isEmpty()) {
                        write(country, new Triple(uri, new Property(NS_WGS_SCHEMA + "lat"),
                                new LiteralValue(fields[latitude]), null), isDescriptionOfCountry);
                    }
                    // featureCodeField always contains at least the "." separator, so this
                    // condition is always true.
                    if (!featureCodeField.isEmpty()) {
                        write(country, new Triple(uri, new Property(NS_EUROPEANA_SCHEMA + "division"),
                                new ResourceValue(NS_GEONAMES_ONTOLOGY + featureCodeField), null),
                                isDescriptionOfCountry);
                    }
                    if (!country.isEmpty()) {
                        write(country, new Triple(uri, new Property(NS_EUROPEANA_SCHEMA + "country"),
                                new LiteralValue(country), null), isDescriptionOfCountry);
                    }
                    // alt label as country code
                    if (featureCodeField.startsWith("A.PCL")) {
                        write(country, new Triple(uri, SKOS.LABEL_ALT, new LiteralValue(country), null),
                                isDescriptionOfCountry);
                    }
                    // one IS_PART_OF triple per URI returned by allParents
                    for (String broaderUri : allParents(uri, country)) {
                        write(country, new Triple(uri, Concepts.DCTEMRS.IS_PART_OF, new ResourceValue(broaderUri),
                                null), isDescriptionOfCountry);
                    }
                    // if (!fields[admin1code].isEmpty()) {
                    // write(country, new Triple(uri, new Property(NS_EUROPEANA_SCHEMA + "admin1"), new LiteralValue(fields[admin1code]), null), isDescriptionOfCountry);
                    // }
                    // if (!fields[admin2code].isEmpty()) {
                    // write(country, new Triple(uri, new Property(NS_EUROPEANA_SCHEMA + "admin2"), new LiteralValue(fields[admin2code]), null), isDescriptionOfCountry);
                    // }
                    // if (!fields[admin3code].isEmpty()) {
                    // write(country, new Triple(uri, new Property(NS_EUROPEANA_SCHEMA + "admin3"), new LiteralValue(fields[admin3code]), null), isDescriptionOfCountry);
                    // }
                    // if (!fields[admin4code].isEmpty()) {
                    // write(country, new Triple(uri, new Property(NS_EUROPEANA_SCHEMA + "admin4"), new LiteralValue(fields[admin4code]), null), isDescriptionOfCountry);
                    // }
                }
            }
        }
    } finally {
        // Always release the input file, also when a malformed line aborts the conversion.
        LineIterator.closeQuietly(it);
    }
    System.out.println("Finished conversion, flushing and closing output files");
    System.out.flush();
    // Finish the RDF output of every country that has a writer registered in files.
    for (Object country : countryToContinent.keySet()) {
        SesameWriter bf = files.get(country.toString());
        if (bf != null) {
            bf.endRDF();
        }
    }
    if (allCountries != null) {
        allCountries.endRDF();
    }
}
From source file:es.ua.dlsi.lexicalinformation.Corpus.java
/** * Method that retrieves all the lines in the corpus containing any of the * surface forms produced by a given candidate. * @param c Candidate generating the surface forms to be searched * @param dic Dictionary form which the candidate is extracted * @return Returns all the lines in the corpus containing any of the surface forms * produced by a given candidate/*from www .jav a2s .c o m*/ */ public Set<String> GetAllExamplesOfInflections(Candidate c, Dictionary dic) { Set<String> inflectedwordforms = c.GetSurfaceForms(dic); Set<String> examples = new LinkedHashSet<String>(); LineIterator corpus_it = null; try { corpus_it = FileUtils.lineIterator(new File(this.path)); } catch (FileNotFoundException ex) { System.err.println("Error while trying to open '" + this.path + "' file."); System.exit(-1); } catch (IOException ex) { System.err.println("Error while reading '" + this.path + "' file."); System.exit(-1); } while (corpus_it.hasNext()) { String line = corpus_it.nextLine(); for (String word : inflectedwordforms) { //If the surface form appears in the sentence... if (line.matches("^" + word + " .*") || line.matches(".* " + word + "$") || line.matches(".* " + word + " .*")) { examples.add(line); } } } corpus_it.close(); return examples; }
From source file:fr.ericlab.mabed.structure.Corpus.java
public void loadCorpus(boolean parallelized) { output = "";/* w w w.jav a2 s . c om*/ if (configuration.prepareCorpus) { prepareCorpus(); } String[] fileArray = new File("input/").list(); nbTimeSlices = 0; NumberFormat formatter = new DecimalFormat("00000000"); ArrayList<Integer> list = new ArrayList<>(); for (String filename : fileArray) { if (filename.endsWith(".text")) { try { list.add(formatter.parse(filename.substring(0, 8)).intValue()); } catch (ParseException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } nbTimeSlices++; } } int a = Collections.min(list), b = Collections.max(list); distribution = new int[nbTimeSlices]; messageCount = 0; LineIterator it = null; try { it = FileUtils.lineIterator(new File("input/" + formatter.format(a) + ".time"), "UTF-8"); if (it.hasNext()) { SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); Date parsedDate = dateFormat.parse(it.nextLine()); startTimestamp = new java.sql.Timestamp(parsedDate.getTime()); } it = FileUtils.lineIterator(new File("input/" + formatter.format(b) + ".time"), "UTF-8"); String timestamp = ""; while (it.hasNext()) { timestamp = it.nextLine(); } SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); Date parsedDate = dateFormat.parse(timestamp); endTimestamp = new java.sql.Timestamp(parsedDate.getTime()); } catch (IOException | ParseException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } finally { LineIterator.closeQuietly(it); } try { // Global index FileInputStream fisMatrix = new FileInputStream("input/indexes/frequencyMatrix.dat"); ObjectInputStream oisMatrix = new ObjectInputStream(fisMatrix); frequencyMatrix = (short[][]) oisMatrix.readObject(); FileInputStream fisVocabulary = new FileInputStream("input/indexes/vocabulary.dat"); ObjectInputStream oisVocabulary = new ObjectInputStream(fisVocabulary); vocabulary = (ArrayList<String>) oisVocabulary.readObject(); // Mention index 
FileInputStream fisMentionMatrix = new FileInputStream("input/indexes/mentionFrequencyMatrix.dat"); ObjectInputStream oisMentionMatrix = new ObjectInputStream(fisMentionMatrix); mentionFrequencyMatrix = (short[][]) oisMentionMatrix.readObject(); FileInputStream fisMentionVocabulary = new FileInputStream("input/indexes/mentionVocabulary.dat"); ObjectInputStream oisMentionVocabulary = new ObjectInputStream(fisMentionVocabulary); mentionVocabulary = (ArrayList<String>) oisMentionVocabulary.readObject(); // Message count String messageCountStr = FileUtils.readFileToString(new File("input/indexes/messageCount.txt")); messageCount = Integer.parseInt(messageCountStr); // Message count distribution FileInputStream fisDistribution = new FileInputStream("input/indexes/messageCountDistribution.dat"); ObjectInputStream oisDistribution = new ObjectInputStream(fisDistribution); distribution = (int[]) oisDistribution.readObject(); } catch (FileNotFoundException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException | ClassNotFoundException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } DecimalFormat df = new DecimalFormat("#,###"); System.out.println(Util.getDate() + " Loaded corpus:"); output += Util.getDate() + " Loaded corpus:\n"; info = " - time-slices: " + df.format(nbTimeSlices) + " time-slices of " + configuration.timeSliceLength + " minutes each\n"; info += " - first message: " + startTimestamp + "\n"; double datasetLength = (nbTimeSlices * configuration.timeSliceLength) / 60 / 24; info += " - last message: " + endTimestamp + " (" + datasetLength + " days)\n"; info += " - number of messages: " + df.format(messageCount); output += info; System.out.println(info); }
From source file:net.mindengine.blogix.web.tiles.TilesContainer.java
private TileLine readAllTileLines(File file) throws IOException { LineIterator it = FileUtils.lineIterator(file, "UTF-8"); /**/*from www .ja v a 2s.c o m*/ * Setting a root tile which will be a container for all tiles */ TileLine rootTileLine = new TileLine(); rootTileLine.indentation = -1; TileLine currentTileLine = rootTileLine; try { while (it.hasNext()) { String line = it.nextLine(); TileLine tileLine = readKeyValue(currentTileLine, line); if (tileLine != null) { currentTileLine = tileLine; } } } finally { LineIterator.closeQuietly(it); } return rootTileLine; }
From source file:com.ipcglobal.fredimport.process.Reference.java
/** * Creates the ref currencies countries. * * @param path the path//from w ww.jav a 2 s. c o m * @throws Exception the exception */ private void createRefCurrenciesCountries(String path) throws Exception { refCountriesByCurrencies = new HashMap<String, String>(); refCurrenciesByCountries = new HashMap<String, String>(); LineIterator it = FileUtils.lineIterator(new File(path + FILENAME_CURRENCIES_COUNTRIES), "UTF-8"); try { while (it.hasNext()) { // Format: <countryName>|<primaryCurrencyName>|<akaCurrencyName1>|<akaCurrencyName2>|... String line = it.nextLine(); String[] fields = line.split("[|]"); for (int i = 0; i < fields.length; i++) fields[i] = fields[i].trim(); fields[1] = fields[1].trim(); // When looking up by country, always return the primary currency refCurrenciesByCountries.put(fields[0], fields[1]); for (int i = 1; i < fields.length; i++) refCountriesByCurrencies.put(fields[i], fields[0]); } } finally { LineIterator.closeQuietly(it); } }
From source file:edu.cornell.med.icb.goby.modes.TrimMode.java
@Override public void execute() throws IOException { ReadsReader reader = null;//from w w w . ja v a 2 s . c o m ReadsWriter writer = null; final ProgressLogger progress = new ProgressLogger(LOG); try { reader = new ReadsReader(inputFilename); final LineIterator lines = new LineIterator(new FileReader(adapterFilename)); final ObjectArrayList<MutableString> adapterList = new ObjectArrayList<MutableString>(); while (lines.hasNext()) { final String next = lines.nextLine(); adapterList.add(new MutableString(next)); } final MutableString[] adapters; if (complementAdapters) { adapters = addComplementAdapters(adapterList); } else { adapters = adapterList.toArray(new MutableString[adapterList.size()]); } progress.start(); writer = new ReadsWriterImpl(new FileOutputStream(outputFilename)); final ByteArrayList newQualScores = new ByteArrayList(); final ByteArrayList newPairQualScores = new ByteArrayList(); final MutableString sequence = new MutableString(); final MutableString sequencePair = new MutableString(); for (final Reads.ReadEntry entry : reader) { // observe(counters, entry.getSequence(), entry.getReadIndex()); ReadsReader.decodeSequence(entry, sequence); final ByteString qualityScores = entry.getQualityScores(); newQualScores.clear(); final MutableString seq1 = trim(adapters, newQualScores, sequence, qualityScores); MutableString pairSeq = null; numSequencesInInput++; if (entry.hasSequencePair()) { newPairQualScores.clear(); ReadsReader.decodeSequence(entry, sequencePair, true); final ByteString pairQualityScores = entry.getQualityScoresPair(); pairSeq = trim(adapters, newPairQualScores, sequencePair, pairQualityScores); numSequencesInInput++; } // System.out.printf(">seq%n%s%n", c); Reads.ReadEntry.Builder builder = Reads.ReadEntry.newBuilder(); builder = builder.mergeFrom(entry).setSequence(ReadsWriterImpl.encodeSequence(seq1, buffer)) .setReadLength(seq1.length()); if (sequence.length() != seq1.length()) { numTrimmed++; final byte[] bytes1 = 
newQualScores.toByteArray(); builder = builder.setQualityScores(ByteString.copyFrom(bytes1)); assert builder.getQualityScores().size() == builder.getSequence() .size() : "sequence length and quality scores must match."; } if (entry.hasSequencePair()) { builder = builder.mergeFrom(entry) .setSequencePair(ReadsWriterImpl.encodeSequence(pairSeq, buffer)) .setReadLength(pairSeq.length()); if (sequencePair.length() != pairSeq.length()) { numTrimmed++; builder = builder .setQualityScoresPair(ByteString.copyFrom(newPairQualScores.toByteArray())); assert builder.getQualityScoresPair().size() == builder.getSequencePair() .size() : "sequence length and quality scores must match."; } } if (seq1.length() > 0 || sequencePair.length() > 0) { // some sequence must remain to append to the output: writer.appendEntry(builder); } progress.lightUpdate(); } progress.stop(); final int numSequencesTrimmed = numTrimmed; double percent = 100d * numSequencesTrimmed; percent /= numSequencesInInput; System.out.printf( "Number of reads trimmed %d (%g %% of input sequences), including: %n" + "left: %d (%g%%)%n" + "right: %d (%g%%), %n" + "fully contained: %d (%g%%)%n", numSequencesTrimmed, percent, numTrimmedLeft, percent(numTrimmedLeft, numSequencesTrimmed), numTrimmedRight, percent(numTrimmedRight, numSequencesTrimmed), numContained, percent(numContained, numSequencesTrimmed)); System.out.flush(); } finally { if (writer != null) { writer.close(); } } progress.stop(); }
From source file:net.orzo.lib.Files.java
/** * Obtains an iterator which reads provided file (specified by path) line by * line. Iterator can be accessed by a classic method pair <i>hasNext()</li> * and <i>next()</i>.//from w w w.ja v a 2s. com */ public FileIterator<Object> fileReader(final String path, final String encoding) throws IOException { final LineIterator itr = FileUtils.lineIterator(new File(path), encoding); return new FileIterator<Object>() { @Override public boolean hasNext() { return itr.hasNext(); } @Override public Object next() { return itr.nextLine(); // TODO wrapping??? } @Override public void remove() { itr.remove(); } public void close() { itr.close(); } public String getPath() { if (File.separator.equals("/")) { return path; } else { return path.replace(File.separator, "/"); } } }; }
From source file:com.googlecode.jgenhtml.CoverageReport.java
/** * Parses a gcov tracefile.//from w ww . j a va2s . c o m * @param traceFile A gcov tracefile. * @param isDescFile true if this is a descriptions (.desc) file. * @param isBaseFile true if this is a baseline file. */ private void parseDatFile(final File traceFile, final boolean isDescFile, final boolean isBaseFile) throws IOException, ParserConfigurationException { //I used the info from here: http://manpages.ubuntu.com/manpages/precise/man1/geninfo.1.html File fileToProcess; if (traceFile.getName().endsWith(".gz")) { LOGGER.log(Level.FINE, "File {0} ends with .gz, going to gunzip it.", traceFile.getName()); fileToProcess = JGenHtmlUtils.gunzip(traceFile); } else { fileToProcess = traceFile; } LineIterator iterator = FileUtils.lineIterator(fileToProcess); try { TestCaseSourceFile testCaseSourceFile = null; String testCaseName = DEFAULT_TEST_NAME; while (iterator.hasNext()) { String line = iterator.nextLine(); int tokenIdx = line.indexOf("SF:"); if (tokenIdx >= 0 || (tokenIdx = line.indexOf("KF:")) >= 0) { String fullPath = line.substring(line.indexOf(tokenIdx) + 4); File sourceFile = new File(fullPath); fullPath = sourceFile.getCanonicalPath(); testCaseSourceFile = parsedFiles.get(fullPath); if (!isBaseFile && testCaseSourceFile == null) { testCaseSourceFile = new TestCaseSourceFile(testTitle, sourceFile.getName()); testCaseSourceFile.setSourceFile(sourceFile); parsedFiles.put(fullPath, testCaseSourceFile); } } else if (line.indexOf("end_of_record") >= 0) { if (testCaseSourceFile != null) { testCaseName = DEFAULT_TEST_NAME; testCaseSourceFile = null; } else { LOGGER.log(Level.FINE, "Unexpected end of record"); } } else if (testCaseSourceFile != null) { testCaseSourceFile.processLine(testCaseName, line, isBaseFile); } else { if (isDescFile) { descriptionsPage.addLine(line); } else if (line.startsWith("TN:")) { String[] data = JGenHtmlUtils.extractLineValues(line); testCaseName = data[0].trim(); if (testCaseName.length() > 0) { if (runTestNames == null) { 
runTestNames = new HashSet<String>(); } runTestNames.add(testCaseName); } } else { LOGGER.log(Level.FINE, "Unexpected line: {0}", line); } } } } finally { LineIterator.closeQuietly(iterator); } }
From source file:net.sf.logsaw.dialect.websphere.WebsphereDialect.java
@Override public void parse(ILogResource log, InputStream input, ILogEntryCollector collector) throws CoreException { Assert.isNotNull(log, "log"); //$NON-NLS-1$ Assert.isNotNull(input, "input"); //$NON-NLS-1$ Assert.isNotNull(collector, "collector"); //$NON-NLS-1$ Assert.isTrue(isConfigured(), "Dialect should be configured by now"); //$NON-NLS-1$ try {// w w w . j ava2 s . c om LogEntry currentEntry = null; IHasEncoding enc = (IHasEncoding) log.getAdapter(IHasEncoding.class); IHasLocale loc = (IHasLocale) log.getAdapter(IHasLocale.class); // WebSphere Dialect doesn't need to care about the timezone, because it is encoded in the log messages DateFormat df = getDateFormat(loc.getLocale()); LineIterator iter = IOUtils.lineIterator(input, enc.getEncoding()); int lineNo = 0; try { while (iter.hasNext()) { // Error handling lineNo++; List<IStatus> statuses = null; boolean fatal = false; // determines whether to interrupt parsing String line = iter.nextLine(); Matcher m = getInternalPattern().matcher(line); if (m.find()) { // The next line matches, so flush the previous entry and continue if (currentEntry != null) { collector.collect(currentEntry); currentEntry = null; } currentEntry = new LogEntry(); for (int i = 0; i < m.groupCount(); i++) { try { extractField(currentEntry, i + 1, m.group(i + 1), df); } catch (CoreException e) { // Mark for interruption fatal = fatal || e.getStatus().matches(IStatus.ERROR); // Messages will be displayed later if (statuses == null) { statuses = new ArrayList<IStatus>(); } if (e.getStatus().isMultiStatus()) { Collections.addAll(statuses, e.getStatus().getChildren()); } else { statuses.add(e.getStatus()); } } } // We encountered errors or warnings if (statuses != null && !statuses.isEmpty()) { currentEntry = null; // Stop propagation IStatus status = new MultiStatus(WebsphereDialectPlugin.PLUGIN_ID, 0, statuses.toArray(new IStatus[statuses.size()]), NLS.bind(Messages.WebsphereDialect_error_failedToParseLine, lineNo), null); if (fatal) { 
// Interrupt parsing in case of error throw new CoreException(status); } else { collector.addMessage(status); } } } else if (currentEntry != null) { // Append to message String msg = currentEntry.get(getFieldProvider().getMessageField()); StringWriter strWriter = new StringWriter(); PrintWriter printWriter = new PrintWriter(strWriter); printWriter.print(msg); printWriter.println(); printWriter.print(line); currentEntry.put(getFieldProvider().getMessageField(), strWriter.toString()); } if (collector.isCanceled()) { // Cancel parsing break; } } if (currentEntry != null) { // Collect left over entry collector.collect(currentEntry); } } finally { LineIterator.closeQuietly(iter); } } catch (Exception e) { throw new CoreException(new Status(IStatus.ERROR, WebsphereDialectPlugin.PLUGIN_ID, NLS.bind(Messages.WebsphereDialect_error_failedToParseFile, new Object[] { log.getName(), e.getLocalizedMessage() }), e)); } }