List of usage examples for org.apache.commons.io LineIterator next
public Object next()
Returns the next line in the wrapped Reader.
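Before the longer, real-world examples below, here is a minimal sketch of the typical hasNext()/next() loop, assuming Commons IO 2.x (where next() returns the line as a String) and a hypothetical input file data.txt; the iterator should always be closed when iteration ends so that the underlying Reader is released.

import java.io.File;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

public class LineIteratorNextExample {
    public static void main(String[] args) throws Exception {
        // "data.txt" is a placeholder path for this sketch
        LineIterator it = FileUtils.lineIterator(new File("data.txt"), "UTF-8");
        try {
            while (it.hasNext()) {
                // next() returns the next line without its line terminator
                String line = it.next();
                System.out.println(line);
            }
        } finally {
            // release the underlying Reader
            LineIterator.closeQuietly(it);
        }
    }
}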
From source file:com.adobe.acs.tools.csv_asset_importer.impl.CsvAssetImporterServlet.java
/**
 * Adds a populated terminating field to the end of each CSV entry.
 * If the last entry in a CSV row is empty, the CSV library has difficulty
 * understanding that this is the end of the row.
 *
 * @param is        the CSV file as an InputStream
 * @param separator the field separator
 * @param charset   the charset
 * @return an InputStream that is the same as is, but where each line has a populated line-termination entry
 * @throws IOException
 */
private InputStream terminateLines(final InputStream is, final char separator, final String charset)
        throws IOException {
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final PrintStream printStream = new PrintStream(baos);
    final LineIterator lineIterator = IOUtils.lineIterator(is, charset);

    while (lineIterator.hasNext()) {
        String line = StringUtils.stripToNull(lineIterator.next());
        if (line != null) {
            line += separator + TERMINATED;
            printStream.println(line);
        }
    }

    return new ByteArrayInputStream(baos.toByteArray());
}
From source file:de.tudarmstadt.lt.seg.app.Segmenter.java
private void run_sequential_line() throws Exception {
    ISentenceSplitter sentenceSplitter = newSentenceSplitter();
    ITokenizer tokenizer = newTokenizer();
    InputStream in = System.in;
    if (!"-".equals(_filename_in))
        in = new FileInputStream(_filename_in);
    LineIterator liter = new LineIterator(
            new BufferedReader(new InputStreamReader(in, Charset.defaultCharset())));
    OutputStream out = System.out;
    if (!"-".equals(_filename_out))
        out = new FileOutputStream(_filename_out);
    PrintWriter w = new PrintWriter(new OutputStreamWriter(out, Charset.defaultCharset()));
    for (long lc = 0; liter.hasNext();) {
        if (++lc % 1000 == 0)
            System.err.format("Processing line %d ('%s')%n", lc, _filename_in);
        String l = liter.next().replace("\\t", "\t").replace("\\n", "\n");
        split_and_tokenize(new StringReader(l), String.format("%s:%d", _filename_in, lc), sentenceSplitter,
                tokenizer, _level_filter, _level_normalize, _merge_types, _merge_tokens, _separator_sentence,
                _separator_token, _separator_desc, w);
    }
}
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
public void create_vocabulary_index(File vocabulary_file) throws IOException {
    File index_dir = new File(_index_dir, "vocab");
    if (index_dir.exists()) {
        LOG.info("Vocabulary index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(1024.0);
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_vocab = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(vocabulary_file);
    if (vocabulary_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));
    Document doc = new Document();
    Field f_word = new StringField("word", "", Field.Store.YES);
    doc.add(f_word);
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 10000 == 0)
            LOG.info("Adding {}'th word.", c);
        String line = iter.next();
        try {
            String word = line.trim();
            f_word.setStringValue(word);
            writer_vocab.addDocument(doc);
        } catch (Exception e) {
            LOG.warn("Could not process line '{}' in file '{}', malformed line.", line, vocabulary_file, e);
        }
    }
    writer_vocab.forceMergeDeletes();
    writer_vocab.commit();
    writer_vocab.close();
}
From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv3Reader.java
/**
 * Iterate through lines and create span annotations accordingly. For
 * multiple span annotations, based on the position of the annotation in the
 * line, update only the end position of the annotation.
 */
private void setAnnotations(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {
    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int sentBegin = -1, sentEnd = 0;
    int prevSentEnd = 0;
    StringBuilder sentLineSb = new StringBuilder();
    String lastSent = "";
    while (lineIterator.hasNext()) {
        String line = lineIterator.next();
        if (line.startsWith("#T_")) {
            setLayerAndFeature(aJCas, line);
            continue;
        }
        if (line.startsWith("#Text=")) {
            if (sentLineSb.toString().isEmpty()) {
                sentLineSb.append(line.substring(line.indexOf("=") + 1));
            } else {
                sentLineSb.append(LF + line.substring(line.indexOf("=") + 1));
            }
            lastSent = sentLineSb.toString();
            continue;
        }
        if (line.startsWith("#FORMAT=")) {
            continue;
        }
        if (line.trim().isEmpty()) {
            if (!sentLineSb.toString().isEmpty()) {
                createSentence(aJCas, sentLineSb.toString(), sentBegin, sentEnd, prevSentEnd);
                prevSentEnd = sentEnd;
                sentBegin = -1; // reset for next sentence begin
                sentLineSb = new StringBuilder();
            }
            continue;
        }
        line = line.trim();
        int count = StringUtils.countMatches(line, "\t");
        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }
        String regex = "(?<!\\\\)*" + Pattern.quote(TAB);
        String[] lines = line.split(regex);
        int begin = Integer.parseInt(lines[1].split("-")[0]);
        int end = Integer.parseInt(lines[1].split("-")[1]);
        if (sentBegin == -1) {
            sentBegin = begin;
        }
        sentEnd = end;
        AnnotationUnit unit = createTokens(aJCas, lines, begin, end);
        int ind = 3;
        setAnnosPerTypePerUnit(lines, unit, ind);
    }

    // the last sentence
    if (!lastSent.isEmpty()) {
        createSentence(aJCas, lastSent, sentBegin, sentEnd, prevSentEnd);
    }

    Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> annosPerTypePerUnit = new HashMap<>();
    setAnnosPerUnit(aJCas, annosPerTypePerUnit);
    addAnnotations(aJCas, annosPerTypePerUnit);
    addChainAnnotations(aJCas);
}
From source file:edu.cornell.med.icb.goby.modes.EmpiricalPMode.java
private void scan() throws FileNotFoundException {
    LineIterator iterator = new LineIterator(new FastBufferedReader(new FileReader(inputFilename)));
    int lineNumber = 0;
    ObjectArrayList<String> elementIds = new ObjectArrayList<String>();
    IntArrayList valuesA = new IntArrayList();
    IntArrayList valuesB = new IntArrayList();
    IntArrayList covariatesA = new IntArrayList();
    IntArrayList covariatesB = new IntArrayList();
    counter = new FormatFieldCounter(0, 2, 2, new String[] { "ALL" });
    setupOutput();
    // ignore the header line:
    iterator.next();
    ProgressLogger pg = new ProgressLogger(LOG);
    pg.displayFreeMemory = true;
    pg.itemsName = "pairs";
    pg.expectedUpdates = countLines(inputFilename) - 1;
    pg.start("Starting to scan pairs.");
    while (iterator.hasNext()) {
        String next = iterator.nextLine();
        String[] tokens = next.split("\t");
        boolean pastIds = false;
        boolean pastValues = false;
        String typeOfPairString = tokens[0];
        ObservationWriter.TypeOfPair typeOfPair = ObservationWriter.TypeOfPair.UNDEFINED;
        for (int i = 0; i < tokens.length; i++) {
            try {
                typeOfPair = ObservationWriter.TypeOfPair.valueOf(typeOfPairString);
            } catch (IllegalArgumentException e) {
                System.err.println(
                        "First token of every line should be WITHIN_GROUP_PAIR or BETWEEN_GROUP_PAIR. Found "
                                + typeOfPairString + " on line " + lineNumber);
                System.exit(1);
            }
            elementIds.clear();
            valuesA.clear();
            valuesB.clear();
            covariatesA.clear();
            covariatesB.clear();
            int j;
            String groupComparison = tokens[1];
            elementIds.add(groupComparison);
            for (j = 2; !"VALUES_A".equals(tokens[j]); j++) {
                if (j == tokens.length) {
                    break;
                }
                elementIds.add(tokens[j]);
            }
            if (j == tokens.length) {
                System.err.println(
                        "Every line must contain the VALUES keyword. Keyword not found on line " + lineNumber);
                System.exit(1);
            }
            j++;
            for (; !"VALUES_B".equals(tokens[j]); j++) {
                if (j == tokens.length) {
                    break;
                }
                valuesA.add(Integer.parseInt(tokens[j]));
            }
            j++;
            for (; !"COVARIATES_A".equals(tokens[j]); j++) {
                if (j == tokens.length) {
                    break;
                }
                valuesB.add(Integer.parseInt(tokens[j]));
            }
            if (j == tokens.length) {
                System.err.println(
                        "Every line must contain the COVARIATES_A keyword. Keyword not found on line "
                                + lineNumber);
                System.exit(1);
            }
            j++;
            for (; !"COVARIATES_B".equals(tokens[j]); j++) {
                if (j == tokens.length) {
                    break;
                }
                covariatesA.add(Integer.parseInt(tokens[j]));
            }
            if (j == tokens.length) {
                System.err.println(
                        "Every line must contain the COVARIATES_B keyword. Keyword not found on line "
                                + lineNumber);
                System.exit(1);
            }
            j++;
            for (; j < tokens.length; j++) {
                covariatesB.add(Integer.parseInt(tokens[j]));
            }
        }
        lineNumber++;
        final String groupComparison = elementIds.get(0);
        process(typeOfPair, groupComparison, elementIds, valuesA, valuesB, covariatesA, covariatesB);
        pg.lightUpdate();
    }
    pg.done(lineNumber);
}
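Note the mix of next() (used once to skip the header line) and nextLine() inside the loop: on a LineIterator the two are interchangeable, since next() simply delegates to nextLine().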
From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoCustomTsvReader.java
/**
 * Iterate through lines and create span annotations accordingly. For multiple span annotations,
 * based on the position of the annotation in the line, update only the end position of the
 * annotation.
 */
private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text)
        throws IOException {
    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int columns = 1; // token number + token columns (minimum required)
    int tokenStart = 0, sentenceStart = 0;
    Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<Type, Set<Feature>>();
    Map<Type, Type> relationayers = new LinkedHashMap<Type, Type>();
    // an annotation for every feature in a layer
    Map<Type, Map<Integer, AnnotationFS>> annotations = new LinkedHashMap<Type, Map<Integer, AnnotationFS>>();
    // store if this is a Begin/Intermediate/End of an annotation
    Map<Type, Map<Integer, String>> beginEndAnno = new LinkedHashMap<Type, Map<Integer, String>>();
    // store annotations of tokens so that they can be used later for relation annotations
    Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations = new LinkedHashMap<Type, Map<String, List<AnnotationFS>>>();
    // store target token ids used for a relation
    Map<Type, Map<String, List<String>>> relationTargets = new LinkedHashMap<Type, Map<String, List<String>>>();
    // store tokens indexed by the concatenation of their begin-end so that lemma and pos
    // annotations can be attached later, if they exist
    indexedTokens = new HashMap<String, Token>();

    while (lineIterator.hasNext()) {
        String line = lineIterator.next().trim();
        if (line.trim().equals("") && sentenceStart == tokenStart) {
            continue;
        }
        if (line.trim().equals("")) {
            text.replace(tokenStart - 1, tokenStart, "");
            tokenStart = tokenStart - 1;
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            tokenStart++;
            sentenceStart = tokenStart;
            text.append("\n");
            continue;
        }
        // sentence
        if (line.startsWith("#text=")) {
            continue;
        }
        if (line.startsWith("#id=")) {
            continue; // it is a comment line
        }
        if (line.startsWith("#")) {
            columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
            continue;
        }
        // sometimes the sentence in #text= might contain a new line, which breaks this reader,
        // so skip such lines
        if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
            continue;
        }
        // if we are still unlucky, the line starts with a number from the sentence but not
        // a token number; check that it is in the format NUM-NUM
        if (!Character.isDigit(line.split("-")[1].charAt(0))) {
            continue;
        }

        int count = StringUtils.countMatches(line, "\t");
        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }

        // adding tokens and sentence
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        String tokenNumberColumn = lineTk.nextToken();
        String tokenColumn = lineTk.nextToken();
        Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
        token.addToIndexes();
        Type posType = JCasUtil.getType(aJcas, POS.class);
        Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
        if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
            indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
        }

        // adding the annotations
        createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations, beginEndAnno,
                tokenAnnotations, relationTargets, lineTk, tokenColumn, tokenNumberColumn);

        tokenStart = tokenStart + tokenColumn.length() + 1;
        text.append(tokenColumn + " ");
    }
    if (tokenStart > sentenceStart) {
        Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
        sentence.addToIndexes();
        text.append("\n");
    }
    createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
}
From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv2Reader.java
/**
 * Iterate through lines and create span annotations accordingly. For
 * multiple span annotations, based on the position of the annotation in the
 * line, update only the end position of the annotation.
 */
private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text)
        throws IOException {
    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int columns = 1; // token number + token columns (minimum required)
    int tokenStart = 0, sentenceStart = 0;
    Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<Type, Set<Feature>>();
    Map<Type, Type> relationayers = new LinkedHashMap<Type, Type>();
    // an annotation for every feature in a layer
    Map<Type, Map<Integer, AnnotationFS>> annotations = new LinkedHashMap<Type, Map<Integer, AnnotationFS>>();
    // store if this is a Begin/Intermediate/End of an annotation
    Map<Type, Map<Integer, String>> beginEndAnno = new LinkedHashMap<Type, Map<Integer, String>>();
    // store annotations of tokens so that they can be used later for relation annotations
    Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations = new LinkedHashMap<Type, Map<String, List<AnnotationFS>>>();
    // store target token ids used for a relation
    Map<Type, Map<String, List<String>>> relationTargets = new LinkedHashMap<Type, Map<String, List<String>>>();
    // store tokens indexed by the concatenation of their begin-end so that lemma and pos
    // annotations can be attached later, if they exist
    indexedTokens = new HashMap<String, Token>();

    while (lineIterator.hasNext()) {
        String line = lineIterator.next().trim();
        if (line.trim().equals("") && sentenceStart == tokenStart) {
            continue;
        }
        if (line.trim().equals("")) {
            text.replace(tokenStart - 1, tokenStart, "");
            tokenStart = tokenStart - 1;
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            tokenStart++;
            sentenceStart = tokenStart;
            text.append("\n");
            continue;
        }
        // sentence
        if (line.startsWith("#text=")) {
            continue;
        }
        if (line.startsWith("#id=")) {
            continue; // it is a comment line
        }
        if (line.startsWith("#")) {
            columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
            continue;
        }
        // sometimes the sentence in #text= might contain a new line, which breaks this reader,
        // so skip such lines
        if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
            continue;
        }
        // if we are still unlucky, the line starts with a number from the sentence but not
        // a token number; check that it is in the format NUM-NUM
        if (!Character.isDigit(line.split("-")[1].charAt(0))) {
            continue;
        }

        int count = StringUtils.countMatches(line, "\t");
        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }

        // adding tokens and sentence
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        String tokenNumberColumn = lineTk.nextToken();
        String tokenColumn = lineTk.nextToken();
        Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
        token.addToIndexes();
        Type posType = JCasUtil.getType(aJcas, POS.class);
        Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
        if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
            indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
        }

        // adding the annotations
        createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations, beginEndAnno,
                tokenAnnotations, relationTargets, lineTk, tokenColumn, tokenNumberColumn);

        tokenStart = tokenStart + tokenColumn.length() + 1;
        text.append(tokenColumn + " ");
    }
    if (tokenStart > sentenceStart) {
        Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
        sentence.addToIndexes();
        text.append("\n");
    }
    createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
}
From source file:de.tudarmstadt.ukp.clarin.webanno.api.dao.RepositoryServiceDbData.java
/**
 * Check if a TAB-Sep training file is in the correct format before importing.
 */
private boolean isTabSepFileFormatCorrect(File aFile) {
    try {
        LineIterator it = new LineIterator(new FileReader(aFile));
        while (it.hasNext()) {
            String line = it.next();
            if (line.trim().length() == 0) {
                continue;
            }
            if (line.split("\t").length != 2) {
                return false;
            }
        }
    } catch (Exception e) {
        return false;
    }
    return true;
}
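One caveat with the example above: the LineIterator (and with it the FileReader) is never closed, so the file handle can leak. Below is a minimal sketch of the same check, under the assumption that closing the iterator in a finally block is acceptable in this context, using LineIterator.closeQuietly:

private boolean isTabSepFileFormatCorrect(File aFile) {
    LineIterator it = null;
    try {
        it = new LineIterator(new FileReader(aFile));
        while (it.hasNext()) {
            String line = it.next();
            if (line.trim().length() == 0) {
                continue; // ignore blank lines
            }
            // a valid TAB-sep training line has exactly two fields
            if (line.split("\t").length != 2) {
                return false;
            }
        }
    } catch (Exception e) {
        return false;
    } finally {
        // release the underlying FileReader
        LineIterator.closeQuietly(it);
    }
    return true;
}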
From source file:dk.netarkivet.archive.arcrepositoryadmin.ReplicaCacheDatabase.java
/**
 * Method for adding the results from a list of filenames on a replica. This list of filenames should return the
 * list of all the files within the database.
 * <p>
 * For each file in the FileListJob the following fields are set for the corresponding entry in the replicafileinfo
 * table: <br/>
 * - filelist_status = ok. <br/>
 * - filelist_checkdatetime = now.
 * <p>
 * For each entry in the replicafileinfo table for the replica which is missing in the results from the FileListJob
 * the following fields are assigned the following values: <br/>
 * - filelist_status = missing. <br/>
 * - filelist_checkdatetime = now.
 *
 * @param filelistFile The list of filenames either parsed from a FilelistJob or the result from a
 *            GetAllFilenamesMessage.
 * @param replica The replica, which the FilelistBatchjob has run upon.
 * @throws ArgumentNotValid If the filelist or the replica is null.
 * @throws UnknownID If the replica does not already exist in the database.
 */
@Override
public void addFileListInformation(File filelistFile, Replica replica) throws ArgumentNotValid, UnknownID {
    ArgumentNotValid.checkNotNull(filelistFile, "File filelistFile");
    ArgumentNotValid.checkNotNull(replica, "Replica replica");

    // Sort the filelist file.
    File sortedResult = new File(filelistFile.getParent(), filelistFile.getName() + ".sorted");
    FileUtils.sortFile(filelistFile, sortedResult);
    final long datasize = FileUtils.countLines(sortedResult);

    Connection con = ArchiveDBConnection.get();
    Set<Long> missingReplicaRFIs = null;
    LineIterator lineIterator = null;
    try {
        // Make sure that the replica exists in the database.
        if (!ReplicaCacheHelpers.existsReplicaInDB(replica, con)) {
            String errorMsg = "Cannot add filelist information, since the replica '" + replica.toString()
                    + "' does not exist in the database.";
            log.warn(errorMsg);
            throw new UnknownID(errorMsg);
        }

        log.info("Starting processing of {} filelist entries for replica {}", datasize, replica.getId());

        // Retrieve the list of files already known by this cache.
        // TODO This does not scale! Should this datastructure
        // (missingReplicaRFIs) be disk-bound in some way?
        missingReplicaRFIs = ReplicaCacheHelpers.retrieveReplicaFileInfoGuidsForReplica(replica.getId(), con);

        // Initialize String iterator
        lineIterator = new LineIterator(new FileReader(sortedResult));

        String lastFileName = "";
        int i = 0;
        while (lineIterator.hasNext()) {
            String file = lineIterator.next();
            // log that it is in progress every so often.
            if ((i % LOGGING_ENTRY_INTERVAL) == 0) {
                log.info("Processed file list entry number {} for replica {}", i, replica);
                // Close connection, and open another one
                // to avoid memory-leak (NAS-2003)
                ArchiveDBConnection.release(con);
                con = ArchiveDBConnection.get();
                log.debug("Databaseconnection has now been renewed");
            }
            ++i;

            // handle duplicates.
            if (file.equals(lastFileName)) {
                log.warn("There have been found multiple files with the name '{}'", file);
                continue;
            }
            lastFileName = file;

            // Add information for one file, and remove the ReplicaRFI from the
            // set of missing ones.
            missingReplicaRFIs.remove(ReplicaCacheHelpers.addFileInformation(file, replica, con));
        }
    } catch (IOException e) {
        throw new IOFailure("Unable to read the filenames from file", e);
    } finally {
        ArchiveDBConnection.release(con);
        LineIterator.closeQuietly(lineIterator);
    }

    con = ArchiveDBConnection.get();
    try {
        // Go through the replicafileinfo entries not found for this replica and change
        // their filelist_status to missing.
        if (missingReplicaRFIs.size() > 0) {
            log.warn("Found {} missing files for replica '{}'.", missingReplicaRFIs.size(), replica);
            for (long rfi : missingReplicaRFIs) {
                // set the replicafileinfo in the database to missing.
                ReplicaCacheHelpers.updateReplicaFileInfoMissingFromFilelist(rfi, con);
            }
        }
        // Update the date for filelist update for this replica.
        ReplicaCacheHelpers.updateFilelistDateForReplica(replica, con);
    } finally {
        ArchiveDBConnection.release(con);
    }
}
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
    File index_dir = new File(_index_dir, "ngram");
    if (index_dir.exists()) {
        LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    // use 80 percent of the available total memory
    double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
    double percentage_ram_buffer = Properties.ramBufferPercentage();
    if (percentage_ram_buffer > 0) {
        double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
        LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
        iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
    }
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_ngram = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(ngram_joined_counts_file);
    if (ngram_joined_counts_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_ngram = new StringField("ngram", "", Store.YES);
    doc.add(f_ngram);
    Field f_n = new IntField("cardinality", 0, Store.YES);
    doc.add(f_n);
    Field f_word = new StringField("word", "", Store.YES);
    doc.add(f_word);
    Field f_hist = new StringField("history", "", Store.YES);
    doc.add(f_hist);
    Field f_lower = new StringField("lower", "", Store.YES);
    doc.add(f_lower);
    Field f_count = new StoredField("num", 0L);
    doc.add(f_count);

    Field[] f_follow = new Field[4];
    f_follow[0] = new StoredField("nf_s", 0L);
    doc.add(f_follow[0]);
    f_follow[1] = new StoredField("nf_N1", 0L);
    doc.add(f_follow[1]);
    f_follow[2] = new StoredField("nf_N2", 0L);
    doc.add(f_follow[2]);
    f_follow[3] = new StoredField("nf_N3", 0L);
    doc.add(f_follow[3]);
    Field[] f_precede = new Field[4];
    f_precede[0] = new StoredField("np_s", 0L);
    doc.add(f_precede[0]);
    f_precede[1] = new StoredField("np_N1", 0L);
    doc.add(f_precede[1]);
    f_precede[2] = new StoredField("np_N2", 0L);
    doc.add(f_precede[2]);
    f_precede[3] = new StoredField("np_N3", 0L);
    doc.add(f_precede[3]);
    Field[] f_followerprecede = new Field[4];
    f_followerprecede[0] = new StoredField("nfp_s", 0L);
    doc.add(f_followerprecede[0]);
    f_followerprecede[1] = new StoredField("nfp_N1", 0L);
    doc.add(f_followerprecede[1]);
    f_followerprecede[2] = new StoredField("nfp_N2", 0L);
    doc.add(f_followerprecede[2]);
    f_followerprecede[3] = new StoredField("nfp_N3", 0L);
    doc.add(f_followerprecede[3]);

    Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
    Long[] S = new Long[] { 0L };
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 100000 == 0)
            LOG.info("Adding {}'th ngram.", c);
        String line = iter.next();
        try {
            String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
            String ngram_str = splits[0];
            if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line,
                        ngram_joined_counts_file);
                continue;
            }
            List<String> ngram = Arrays.asList(ngram_str.split(" "));
            long num = Long.parseLong(splits[1]);
            int n = ngram.size();

            f_ngram.setStringValue(ngram_str);
            f_n.setIntValue(n);
            f_word.setStringValue(ngram.get(ngram.size() - 1));
            f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
            f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
            f_count.setLongValue(num);

            for (int j = 0; j < f_follow.length; j++) {
                f_follow[j].setLongValue(0L);
                f_precede[j].setLongValue(0L);
                f_followerprecede[j].setLongValue(0L);
            }

            if (splits.length > 2 && !splits[2].isEmpty()) {
                // precede or follow or followerprecede
                String[] splits_ = splits[2].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 3 && !splits[3].isEmpty()) {
                // should be follow or followerprecede
                String[] splits_ = splits[3].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 4 && !splits[4].isEmpty()) {
                // should be followerprecede
                String[] splits_ = splits[4].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }

            writer_ngram.addDocument(doc);

            while (N.length <= n) {
                N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
            }
            if (num == 1L)
                N[n][1]++;
            else if (num == 2L)
                N[n][2]++;
            else if (num == 3L)
                N[n][3]++;
            else if (num == 4L)
                N[n][4]++;
            else
                N[n][5]++;
            N[n][0]++;
            S[n] += num;
        } catch (Exception e) {
            LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line,
                    ngram_joined_counts_file, c, e);
        }
    }
    writer_ngram.forceMergeDeletes();
    writer_ngram.commit();
    writer_ngram.close();

    StringBuilder b = new StringBuilder(String.format(
            "#%n# Number of times where an ngram occurred: %n# at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
            N.length - 1));
    for (int n = 1; n < N.length; n++)
        b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
    for (int n = 1; n < S.length; n++)
        b.append(String.format("s%d=%d%n", n, S[n]));
    FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());
}