List of usage examples for org.apache.commons.io LineIterator hasNext
public boolean hasNext()
Indicates whether the Reader has more lines.
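As a quick orientation before the project examples, here is a minimal sketch of the canonical hasNext()/nextLine() loop. The file name example.txt and the class name are placeholders for this sketch, not part of any example below:

import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

public class LineIteratorHasNextExample {
    public static void main(String[] args) throws IOException {
        File file = new File("example.txt"); // placeholder input file
        LineIterator it = FileUtils.lineIterator(file, "UTF-8");
        try {
            // hasNext() returns true while the underlying Reader has more lines
            while (it.hasNext()) {
                String line = it.nextLine();
                System.out.println(line);
            }
        } finally {
            // close the iterator to release the underlying Reader
            it.close();
        }
    }
}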
From source file:com.thalesgroup.hudson.plugins.klocwork.KloSource.java

/**
 * Splits the source code into three blocks: the line to highlight and the
 * source code before and after this line.
 *
 * @param sourceFile the source code of the whole file as rendered HTML string
 */
public final void splitSourceFile(final String sourceFile) {
    StringBuilder output = new StringBuilder(sourceFile.length());
    KloFile kloFile = kloWorkspaceFile.getKloFile();
    LineIterator lineIterator = IOUtils.lineIterator(new StringReader(sourceFile));
    int lineNumber = 1;
    //---header
    while (lineNumber < SOURCE_GENERATOR_OFFSET) {
        copyLine(output, lineIterator);
        lineNumber++;
    }
    lineNumber = 1;
    //---iterate before the error line
    while (lineNumber < Integer.parseInt((String) kloFile.get("line"))) {
        copyLine(output, lineIterator);
        lineNumber++;
    }
    output.append("</code>\n");
    //---Error message
    output.append("</td></tr>\n");
    output.append("<tr><td bgcolor=\"");
    appendRangeColor(output);
    output.append("\">\n");
    output.append("<div tooltip=\"");
    //AM
    //outputEscaped(output, kloFile.getProblemId()+":"+kloFile.getMessage());
    outputEscaped(output, kloFile.get("problemID") + ":" + kloFile.get("message"));
    output.append("\" nodismiss=\"\">\n");
    output.append("<code><b>\n");
    //The current line error
    copyLine(output, lineIterator);
    lineNumber++;
    //End of the code
    output.append("</b></code>\n");
    output.append("</div>\n");
    output.append("</td></tr>\n");
    output.append("<tr><td>\n");
    output.append("<code>\n");
    while (lineIterator.hasNext()) {
        copyLine(output, lineIterator);
    }
    output.append("</code>\n");
    output.append("</td></tr>\n");
    sourceCode = output.toString();
}
From source file:com.datamelt.nifi.processors.ExecuteRuleEngine.java
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    // map used to store the attribute name and its value from the content of the flow file
    final Map<String, String> propertyMap = new HashMap<>();
    // get a logger instance
    final ComponentLog logger = getLogger();
    // a header from the content if present
    final AtomicReference<HeaderRow> header = new AtomicReference<>();
    AtomicBoolean error = new AtomicBoolean();
    // get the flow file
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    // list of rows from splitting the original flow file content
    ArrayList<RuleEngineRow> flowFileRows = new ArrayList<RuleEngineRow>();
    // list of rows containing the detailed results of the ruleengine
    ArrayList<RuleEngineRow> flowFileDetails = new ArrayList<RuleEngineRow>();
    boolean headerPresent = context.getProperty(ATTRIBUTE_HEADER_PRESENT).getValue().equals("true");
    // put the name of the ruleengine zip file in the list of properties
    propertyMap.put(PROPERTY_RULEENGINE_ZIPFILE_NAME,
            context.getProperty(ATTRIBUTE_RULEENGINE_ZIPFILE).getValue());
    final int batchSize = Integer.parseInt(context.getProperty(BATCH_SIZE_NAME).getValue());
    // read flow file into input stream
    session.read(flowFile, new InputStreamCallback() {
        public void process(InputStream in) throws IOException {
            try {
                // iterator over the lines from the input stream
                LineIterator iterator = IOUtils.lineIterator(in, "utf-8");
                // check if configuration indicates that a header row is present in the flow file content
                if (headerPresent) {
                    logger.debug("configuration indicates a header row is present in flow file content");
                    // if there is at least one row of data and the header is not defined yet
                    if (iterator.hasNext() && header.get() == null) {
                        // set the header from the content
                        header.set(new HeaderRow(iterator.nextLine(), separator));
                    }
                }
                // if no header row is present in the flow file content
                else {
                    logger.debug("configuration indicates no header row is present in flow file content");
                    // use the header from the field names
                    header.set(headerFromFieldNames);
                }
                // loop over all rows of data
                while (iterator.hasNext()) {
                    // we handle the error per row of data
                    error.set(false);
                    // get a row to process
                    String row = iterator.nextLine();
                    // check that we have data
                    if (row != null && !row.trim().equals("")) {
                        RowFieldCollection rowFieldCollection = null;
                        try {
                            rowFieldCollection = getRowFieldCollection(row, header.get());
                            logger.debug("RowFieldCollection header contains: "
                                    + rowFieldCollection.getHeader().getNumberOfFields() + " fields");
                            logger.debug("RowFieldCollection contains: "
                                    + rowFieldCollection.getNumberOfFields() + " fields");
                            // run the ruleengine with the given data from the flow file
                            logger.debug("running business ruleengine...");
                            // run the business logic/rules against the data
                            ruleEngine.run("flowfile", rowFieldCollection);
                            // add some debugging output that might be useful
                            logger.debug("number of rulegroups: " + ruleEngine.getNumberOfGroups());
                            logger.debug("number of rulegroups passed: " + ruleEngine.getNumberOfGroupsPassed());
                            logger.debug("number of rulegroups failed: " + ruleEngine.getNumberOfGroupsFailed());
                            logger.debug("number of rulegroups skipped: " + ruleEngine.getNumberOfGroupsSkipped());
                            logger.debug("number of rules: " + ruleEngine.getNumberOfRules());
                            logger.debug("number of rules passed: " + ruleEngine.getNumberOfRulesPassed());
                            logger.debug("number of rules failed: " + ruleEngine.getNumberOfRulesFailed());
                            logger.debug("number of actions: " + ruleEngine.getNumberOfActions());
                            // add some properties of the ruleengine execution to the map
                            addRuleEngineProperties(propertyMap);
                        } catch (Exception ex) {
                            error.set(true);
                            logger.error(ex.getMessage(), ex);
                        }
                        // if no error occurred we save the data for the creation of the flow files
                        if (!error.get()) {
                            // process only if the collection of fields was changed by
                            // a ruleengine action. this means the data was updated so
                            // we will have to re-write/re-create the flow file content.
                            if (rowFieldCollection.isCollectionUpdated()) {
                                // put an indicator that the data was modified by the ruleengine
                                propertyMap.put(PROPERTY_RULEENGINE_CONTENT_MODIFIED, "true");
                                logger.debug("data was modified - updating flow file content with ruleengine results");
                                // the RuleEngineRow instance will contain the row of data and the map of properties
                                // and will later be used when the flow files are created
                                flowFileRows.add(new RuleEngineRow(getResultRow(rowFieldCollection), propertyMap));
                            } else {
                                // put an indicator that the data was NOT modified by the ruleengine
                                propertyMap.put(PROPERTY_RULEENGINE_CONTENT_MODIFIED, "false");
                                logger.debug("data was not modified - using original content");
                                // the RuleEngineRow instance will contain the row of data and the map of properties
                                // and will later be used when the flow files are created
                                flowFileRows.add(new RuleEngineRow(row, propertyMap));
                            }
                            if (flowFileRows.size() >= batchSize) {
                                // generate flow files from the individual rows
                                List<FlowFile> splitFlowFiles = generateFlowFileSplits(context, session,
                                        flowFileRows, header.get(), headerPresent);
                                // transfer all individual rows to success relationship
                                if (splitFlowFiles.size() > 0) {
                                    session.transfer(splitFlowFiles, SUCCESS);
                                }
                            }
                            // if the user configured detailed results
                            if (context.getProperty(ATTRIBUTE_OUTPUT_DETAILED_RESULTS).getValue().equals("true")) {
                                // get the configured output type
                                String outputType = context.getProperty(ATTRIBUTE_OUTPUT_DETAILED_RESULTS_TYPE).getValue();
                                logger.debug("configuration set to output detailed results with type [" + outputType + "]");
                                // we need to create a flow file only, if the ruleengine results are according to the output type settings
                                if (outputType.equals(OUTPUT_TYPE_ALL_GROUPS_ALL_RULES)
                                        || (outputType.equals(OUTPUT_TYPE_FAILED_GROUPS_ALL_RULES)
                                                && ruleEngine.getNumberOfGroupsFailed() > 0)
                                        || (outputType.equals(OUTPUT_TYPE_FAILED_GROUPS_FAILED_RULES)
                                                && ruleEngine.getNumberOfGroupsFailed() > 0)
                                        || (outputType.equals(OUTPUT_TYPE_FAILED_GROUPS_PASSED_RULES)
                                                && ruleEngine.getNumberOfGroupsFailed() > 0)
                                        || (outputType.equals(OUTPUT_TYPE_PASSED_GROUPS_ALL_RULES)
                                                && ruleEngine.getNumberOfGroupsPassed() > 0)
                                        || (outputType.equals(OUTPUT_TYPE_PASSED_GROUPS_FAILED_RULES)
                                                && ruleEngine.getNumberOfGroupsPassed() > 0
                                        || (outputType.equals(OUTPUT_TYPE_PASSED_GROUPS_PASSED_RULES)
                                                && ruleEngine.getNumberOfGroupsPassed() > 0))) {
                                    // create the content for the flow file
                                    String content = getFlowFileRuleEngineDetailsContent(header.get(),
                                            headerPresent, outputType, row);
                                    // add results to the list
                                    flowFileDetails.add(new RuleEngineRow(content, propertyMap));
                                    if (flowFileDetails.size() >= batchSize) {
                                        List<FlowFile> detailsFlowFiles = generateFlowFilesRuleEngineDetails(
                                                context, session, flowFileDetails, header.get(), headerPresent);
                                        // transfer all individual rows to detailed relationship
                                        if (detailsFlowFiles.size() > 0) {
                                            session.transfer(detailsFlowFiles, DETAILED_RESULTS);
                                        }
                                    }
                                }
                            }
                            // clear the collections of ruleengine results
                            ruleEngine.getRuleExecutionCollection().clear();
                        }
                        // if we have an error we create a flow file from the current row of data and send it to the failure relationship
                        else {
                            FlowFile failureFlowFile = generateFailureFlowFile(context, session, row,
                                    header.get(), headerPresent);
                            session.transfer(failureFlowFile, FAILURE);
                        }
                    }
                }
                LineIterator.closeQuietly(iterator);
            } catch (Exception ex) {
                ex.printStackTrace();
                logger.error("error running the business ruleengine", ex);
            }
        }
    });
    // generate flow files from the remaining individual rows
    List<FlowFile> splitFlowFiles = generateFlowFileSplits(context, session, flowFileRows, header.get(), headerPresent);
    // generate flow files with the remaining detailed results
    List<FlowFile> detailsFlowFiles = generateFlowFilesRuleEngineDetails(context, session, flowFileDetails,
            header.get(), headerPresent);
    // transfer the original flow file
    session.transfer(flowFile, ORIGINAL);
    // transfer all individual rows to success relationship
    if (splitFlowFiles.size() > 0) {
        session.transfer(splitFlowFiles, SUCCESS);
    }
    // transfer all detailed results to the detailed results relationship
    if (detailsFlowFiles.size() > 0) {
        session.transfer(detailsFlowFiles, DETAILED_RESULTS);
    }
}
From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoCustomTsvReader.java
/**
 * Iterate through lines and create span annotations accordingly. For multiple span annotations,
 * based on the position of the annotation in the line, update only the end position of the
 * annotation.
 */
private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text)
        throws IOException {
    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int columns = 1; // token number + token columns (minimum required)
    int tokenStart = 0, sentenceStart = 0;
    Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<Type, Set<Feature>>();
    Map<Type, Type> relationayers = new LinkedHashMap<Type, Type>();
    // an annotation for every feature in a layer
    Map<Type, Map<Integer, AnnotationFS>> annotations = new LinkedHashMap<Type, Map<Integer, AnnotationFS>>();
    // store if this is a Begin/Intermediate/End of an annotation
    Map<Type, Map<Integer, String>> beginEndAnno = new LinkedHashMap<Type, Map<Integer, String>>();
    // store annotations of tokens so that they can be used later for relation annotations
    Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations = new LinkedHashMap<Type, Map<String, List<AnnotationFS>>>();
    // store target token ids used for a relation
    Map<Type, Map<String, List<String>>> relationTargets = new LinkedHashMap<Type, Map<String, List<String>>>();
    // store tokens indexed by the concatenation of their begin-end offsets so that lemma and pos
    // annotations can be attached later, if they exist
    indexedTokens = new HashMap<String, Token>();
    while (lineIterator.hasNext()) {
        String line = lineIterator.next().trim();
        if (line.trim().equals("") && sentenceStart == tokenStart) {
            continue;
        }
        if (line.trim().equals("")) {
            text.replace(tokenStart - 1, tokenStart, "");
            tokenStart = tokenStart - 1;
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            tokenStart++;
            sentenceStart = tokenStart;
            text.append("\n");
            continue;
        }
        // sentence
        if (line.startsWith("#text=")) {
            continue;
        }
        if (line.startsWith("#id=")) {
            continue; // it is a comment line
        }
        if (line.startsWith("#")) {
            columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
            continue;
        }
        // sometimes the sentence in #text= contains a new line which breaks this reader,
        // so skip such lines
        if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
            continue;
        }
        // if we are still unlucky, the line starts with a number from the sentence but not
        // a token number; check whether it is in the format NUM-NUM
        if (!Character.isDigit(line.split("-")[1].charAt(0))) {
            continue;
        }
        int count = StringUtils.countMatches(line, "\t");
        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }
        // adding tokens and sentence
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        String tokenNumberColumn = lineTk.nextToken();
        String tokenColumn = lineTk.nextToken();
        Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
        token.addToIndexes();
        Type posType = JCasUtil.getType(aJcas, POS.class);
        Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
        if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
            indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
        }
        // adding the annotations
        createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations, beginEndAnno,
                tokenAnnotations, relationTargets, lineTk, tokenColumn, tokenNumberColumn);
        tokenStart = tokenStart + tokenColumn.length() + 1;
        text.append(tokenColumn + " ");
    }
    if (tokenStart > sentenceStart) {
        Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
        sentence.addToIndexes();
        text.append("\n");
    }
    createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
}
From source file:com.thalesgroup.hudson.plugins.cppcheck.CppcheckSource.java
/**
 * Splits the source code into three blocks: the line to highlight and the
 * source code before and after this line.
 *
 * @param sourceFile the source code of the whole file as rendered HTML string
 */
private void splitSourceFile(final String sourceFile) {
    StringBuilder output = new StringBuilder(sourceFile.length());
    CppcheckFile cppcheckFile = cppcheckWorkspaceFile.getCppcheckFile();
    LineIterator lineIterator = IOUtils.lineIterator(new StringReader(sourceFile));
    int lineNumber = 1;
    //---header
    while (lineNumber < SOURCE_GENERATOR_OFFSET) {
        copyLine(output, lineIterator);
        lineNumber++;
    }
    lineNumber = 1;
    //---iterate before the error line
    while (lineNumber < cppcheckFile.getLineNumber()) {
        copyLine(output, lineIterator);
        lineNumber++;
    }
    output.append("</code>\n");
    //---Error message
    output.append("</td></tr>\n");
    output.append("<tr><td bgcolor=\"");
    appendRangeColor(output);
    output.append("\">\n");
    output.append("<div tooltip=\"");
    outputEscaped(output, "<h3>");
    outputEscaped(output, cppcheckFile.getCppCheckId());
    output.append(": ");
    outputEscaped(output, cppcheckFile.getMessage());
    outputEscaped(output, "</h3>");
    if (cppcheckFile.getVerbose() != null) {
        outputEscaped(output, "<p style=\"white-space: pre-wrap;\">");
        outputEscaped(output, cppcheckFile.getVerbose());
        outputEscaped(output, "</p>");
    }
    output.append("\" nodismiss=\"\">\n");
    output.append("<code><b>\n");
    //The current line error
    copyLine(output, lineIterator);
    lineNumber++;
    //End of the code
    output.append("</b></code>\n");
    output.append("</div>\n");
    output.append("</td></tr>\n");
    output.append("<tr><td>\n");
    output.append("<code>\n");
    while (lineIterator.hasNext()) {
        copyLine(output, lineIterator);
    }
    output.append("</code>\n");
    output.append("</td></tr>\n");
    sourceCode = output.toString();
}
From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv2Reader.java
/**
 * Iterate through lines and create span annotations accordingly. For
 * multiple span annotations, based on the position of the annotation in the
 * line, update only the end position of the annotation.
 */
private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text)
        throws IOException {
    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int columns = 1; // token number + token columns (minimum required)
    int tokenStart = 0, sentenceStart = 0;
    Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<Type, Set<Feature>>();
    Map<Type, Type> relationayers = new LinkedHashMap<Type, Type>();
    // an annotation for every feature in a layer
    Map<Type, Map<Integer, AnnotationFS>> annotations = new LinkedHashMap<Type, Map<Integer, AnnotationFS>>();
    // store if this is a Begin/Intermediate/End of an annotation
    Map<Type, Map<Integer, String>> beginEndAnno = new LinkedHashMap<Type, Map<Integer, String>>();
    // store annotations of tokens so that they can be used later for relation annotations
    Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations = new LinkedHashMap<Type, Map<String, List<AnnotationFS>>>();
    // store target token ids used for a relation
    Map<Type, Map<String, List<String>>> relationTargets = new LinkedHashMap<Type, Map<String, List<String>>>();
    // store tokens indexed by the concatenation of their begin-end offsets so that lemma
    // and pos annotations can be attached later, if they exist
    indexedTokens = new HashMap<String, Token>();
    while (lineIterator.hasNext()) {
        String line = lineIterator.next().trim();
        if (line.trim().equals("") && sentenceStart == tokenStart) {
            continue;
        }
        if (line.trim().equals("")) {
            text.replace(tokenStart - 1, tokenStart, "");
            tokenStart = tokenStart - 1;
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            tokenStart++;
            sentenceStart = tokenStart;
            text.append("\n");
            continue;
        }
        // sentence
        if (line.startsWith("#text=")) {
            continue;
        }
        if (line.startsWith("#id=")) {
            continue; // it is a comment line
        }
        if (line.startsWith("#")) {
            columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
            continue;
        }
        // sometimes the sentence in #text= contains a new line which breaks this reader,
        // so skip such lines
        if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
            continue;
        }
        // if we are still unlucky, the line starts with a number from the sentence but not
        // a token number; check whether it is in the format NUM-NUM
        if (!Character.isDigit(line.split("-")[1].charAt(0))) {
            continue;
        }
        int count = StringUtils.countMatches(line, "\t");
        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }
        // adding tokens and sentence
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        String tokenNumberColumn = lineTk.nextToken();
        String tokenColumn = lineTk.nextToken();
        Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
        token.addToIndexes();
        Type posType = JCasUtil.getType(aJcas, POS.class);
        Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
        if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
            indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
        }
        // adding the annotations
        createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations, beginEndAnno,
                tokenAnnotations, relationTargets, lineTk, tokenColumn, tokenNumberColumn);
        tokenStart = tokenStart + tokenColumn.length() + 1;
        text.append(tokenColumn + " ");
    }
    if (tokenStart > sentenceStart) {
        Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
        sentence.addToIndexes();
        text.append("\n");
    }
    createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
}
From source file:com.edgenius.wiki.service.impl.BackupServiceImpl.java
/**
 * @param binderFile
 */
private int versionCheck(File binderFile) {
    // it is not worth using any XML technique just to get the version number...
    String verStr = null;
    LineIterator iter = null;
    try {
        for (iter = FileUtils.lineIterator(binderFile); iter.hasNext();) {
            String line = iter.nextLine();
            if (verStr != null) {
                int eIdx = line.indexOf("</version>");
                if (eIdx != -1) {
                    verStr += line.substring(0, eIdx);
                    break;
                } else {
                    AuditLogger.error("Version in binder can not find close tag in next line, failed~");
                }
                // the version tag is not expected to span more than 2 lines
                break;
            }
            int sIdx = line.indexOf("<version>");
            if (sIdx != -1) {
                int eIdx = line.indexOf("</version>");
                if (eIdx != -1) {
                    verStr = line.substring(sIdx + "<version>".length(), eIdx);
                    break;
                } else {
                    verStr = line.substring(sIdx + "<version>".length());
                    AuditLogger.error("Version in binder even not insdie same line!?");
                }
            }
        }
    } catch (IOException e) {
        log.error("Unable to read binder file to get version ", e);
    } finally {
        if (iter != null) {
            iter.close();
        }
    }
    if (verStr == null || NumberUtils.toFloat(verStr.trim(), -1f) == -1f) {
        log.error("version parse failed.");
        return 0;
    }
    // upgrade the binder file
    try {
        upgradeService.doBackupPackageUpgardeForBinder(verStr.trim(), binderFile);
    } catch (Exception e) {
        log.error("Unexpected erorr while upgrade backup binder file from " + verStr.trim() + " to "
                + Version.VERSION, e);
    }
    return (int) (NumberUtils.toFloat(verStr.trim(), 0) * 1000);
}
From source file:de.tudarmstadt.ukp.clarin.webanno.api.dao.RepositoryServiceDbData.java
/**
 * Check if a TAB-sep training file is in the correct format before importing.
 */
private boolean isTabSepFileFormatCorrect(File aFile) {
    LineIterator it = null;
    try {
        it = new LineIterator(new FileReader(aFile));
        while (it.hasNext()) {
            String line = it.next();
            if (line.trim().length() == 0) {
                continue;
            }
            if (line.split("\t").length != 2) {
                return false;
            }
        }
    } catch (Exception e) {
        return false;
    } finally {
        // close the iterator (and the underlying FileReader) even on early return
        LineIterator.closeQuietly(it);
    }
    return true;
}
From source file:mitm.common.security.smime.SMIMEBuilderImplTest.java
@Test
public void testEncryptBase64EncodeBug() throws Exception {
    MimeMessage message = new MimeMessage(MailSession.getDefaultSession());
    message.setSubject("test");
    message.setContent("test", "text/plain");
    SMIMEBuilder builder = new SMIMEBuilderImpl(message, "to", "subject", "from");
    X509Certificate certificate = TestUtils
            .loadCertificate("test/resources/testdata/certificates/certificate-base64-encode-bug.cer");
    builder.addRecipient(certificate, SMIMERecipientMode.ISSUER_SERIAL);
    builder.encrypt(SMIMEEncryptionAlgorithm.DES_EDE3_CBC);
    MimeMessage newMessage = builder.buildMessage();
    newMessage.saveChanges();
    File file = new File(tempDir, "testEncryptBase64EncodeBug.eml");
    FileOutputStream output = new FileOutputStream(file);
    MailUtils.writeMessage(newMessage, output);
    newMessage = MailUtils.loadMessage(file);
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    newMessage.writeTo(new SkipHeadersOutputStream(bos));
    String blob = new String(bos.toByteArray(), "us-ascii");
    // check if all lines are not longer than 76 characters
    LineIterator it = IOUtils.lineIterator(new StringReader(blob));
    while (it.hasNext()) {
        String next = it.nextLine();
        if (next.length() > 76) {
            fail("Line length exceeds 76: " + next);
        }
    }
}
From source file:dk.netarkivet.archive.arcrepositoryadmin.ReplicaCacheDatabase.java
/**
 * Method for adding the results from a list of filenames on a replica. This list of filenames should return the
 * list of all the files within the database.
 * <p>
 * For each file in the FileListJob the following fields are set for the corresponding entry in the replicafileinfo
 * table: <br/>
 * - filelist_status = ok. <br/>
 * - filelist_checkdatetime = now.
 * <p>
 * For each entry in the replicafileinfo table for the replica which are missing in the results from the FileListJob
 * the following fields are assigned the following values: <br/>
 * - filelist_status = missing. <br/>
 * - filelist_checkdatetime = now.
 *
 * @param filelistFile The list of filenames either parsed from a FilelistJob or the result from a
 *            GetAllFilenamesMessage.
 * @param replica The replica, which the FilelistBatchjob has run upon.
 * @throws ArgumentNotValid If the filelist or the replica is null.
 * @throws UnknownID If the replica does not already exist in the database.
 */
@Override
public void addFileListInformation(File filelistFile, Replica replica) throws ArgumentNotValid, UnknownID {
    ArgumentNotValid.checkNotNull(filelistFile, "File filelistFile");
    ArgumentNotValid.checkNotNull(replica, "Replica replica");
    // Sort the filelist file.
    File sortedResult = new File(filelistFile.getParent(), filelistFile.getName() + ".sorted");
    FileUtils.sortFile(filelistFile, sortedResult);
    final long datasize = FileUtils.countLines(sortedResult);
    Connection con = ArchiveDBConnection.get();
    Set<Long> missingReplicaRFIs = null;
    LineIterator lineIterator = null;
    try {
        // Make sure, that the replica exists in the database.
        if (!ReplicaCacheHelpers.existsReplicaInDB(replica, con)) {
            String errorMsg = "Cannot add filelist information, since the replica '" + replica.toString()
                    + "' does not exist in the database.";
            log.warn(errorMsg);
            throw new UnknownID(errorMsg);
        }
        log.info("Starting processing of {} filelist entries for replica {}", datasize, replica.getId());
        // retrieve the list of files already known by this cache.
        // TODO This does not scale! Should this datastructure
        // (missingReplicaRFIs) be disk-bound in some way.
        missingReplicaRFIs = ReplicaCacheHelpers.retrieveReplicaFileInfoGuidsForReplica(replica.getId(), con);
        // Initialize String iterator
        lineIterator = new LineIterator(new FileReader(sortedResult));
        String lastFileName = "";
        int i = 0;
        while (lineIterator.hasNext()) {
            String file = lineIterator.next();
            // log that it is in progress every so often.
            if ((i % LOGGING_ENTRY_INTERVAL) == 0) {
                log.info("Processed file list entry number {} for replica {}", i, replica);
                // Close connection, and open another one
                // to avoid memory-leak (NAS-2003)
                ArchiveDBConnection.release(con);
                con = ArchiveDBConnection.get();
                log.debug("Databaseconnection has now been renewed");
            }
            ++i;
            // handle duplicates.
            if (file.equals(lastFileName)) {
                log.warn("There have been found multiple files with the name '{}'", file);
                continue;
            }
            lastFileName = file;
            // Add information for one file, and remove the ReplicaRFI from the
            // set of missing ones.
            missingReplicaRFIs.remove(ReplicaCacheHelpers.addFileInformation(file, replica, con));
        }
    } catch (IOException e) {
        throw new IOFailure("Unable to read the filenames from file", e);
    } finally {
        ArchiveDBConnection.release(con);
        LineIterator.closeQuietly(lineIterator);
    }
    con = ArchiveDBConnection.get();
    try {
        // go through the not found replicafileinfo for this replica to change
        // their filelist_status to missing.
        if (missingReplicaRFIs.size() > 0) {
            log.warn("Found {} missing files for replica '{}'.", missingReplicaRFIs.size(), replica);
            for (long rfi : missingReplicaRFIs) {
                // set the replicafileinfo in the database to missing.
                ReplicaCacheHelpers.updateReplicaFileInfoMissingFromFilelist(rfi, con);
            }
        }
        // Update the date for filelist update for this replica.
        ReplicaCacheHelpers.updateFilelistDateForReplica(replica, con);
    } finally {
        ArchiveDBConnection.release(con);
    }
}
From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java
public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
    File index_dir = new File(_index_dir, "ngram");
    if (index_dir.exists()) {
        LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    // use 80 percent of the available total memory
    double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
    double percentage_ram_buffer = Properties.ramBufferPercentage();
    if (percentage_ram_buffer > 0) {
        double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
        LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
        iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
    }
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_ngram = new IndexWriter(directory, iwc);
    InputStream in = new FileInputStream(ngram_joined_counts_file);
    if (ngram_joined_counts_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));
    Document doc = new Document();
    Field f_ngram = new StringField("ngram", "", Store.YES);
    doc.add(f_ngram);
    Field f_n = new IntField("cardinality", 0, Store.YES);
    doc.add(f_n);
    Field f_word = new StringField("word", "", Store.YES);
    doc.add(f_word);
    Field f_hist = new StringField("history", "", Store.YES);
    doc.add(f_hist);
    Field f_lower = new StringField("lower", "", Store.YES);
    doc.add(f_lower);
    Field f_count = new StoredField("num", 0L);
    doc.add(f_count);
    Field[] f_follow = new Field[4];
    f_follow[0] = new StoredField("nf_s", 0L);
    doc.add(f_follow[0]);
    f_follow[1] = new StoredField("nf_N1", 0L);
    doc.add(f_follow[1]);
    f_follow[2] = new StoredField("nf_N2", 0L);
    doc.add(f_follow[2]);
    f_follow[3] = new StoredField("nf_N3", 0L);
    doc.add(f_follow[3]);
    Field[] f_precede = new Field[4];
    f_precede[0] = new StoredField("np_s", 0L);
    doc.add(f_precede[0]);
    f_precede[1] = new StoredField("np_N1", 0L);
    doc.add(f_precede[1]);
    f_precede[2] = new StoredField("np_N2", 0L);
    doc.add(f_precede[2]);
    f_precede[3] = new StoredField("np_N3", 0L);
    doc.add(f_precede[3]);
    Field[] f_followerprecede = new Field[4];
    f_followerprecede[0] = new StoredField("nfp_s", 0L);
    doc.add(f_followerprecede[0]);
    f_followerprecede[1] = new StoredField("nfp_N1", 0L);
    doc.add(f_followerprecede[1]);
    f_followerprecede[2] = new StoredField("nfp_N2", 0L);
    doc.add(f_followerprecede[2]);
    f_followerprecede[3] = new StoredField("nfp_N3", 0L);
    doc.add(f_followerprecede[3]);
    Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
    Long[] S = new Long[] { 0L };
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 100000 == 0)
            LOG.info("Adding {}'th ngram.", c);
        String line = iter.next();
        try {
            String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
            String ngram_str = splits[0];
            if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line, ngram_joined_counts_file);
                continue;
            }
            List<String> ngram = Arrays.asList(ngram_str.split(" "));
            long num = Long.parseLong(splits[1]);
            int n = ngram.size();
            f_ngram.setStringValue(ngram_str);
            f_n.setIntValue(n);
            f_word.setStringValue(ngram.get(ngram.size() - 1));
            f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
            f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
            f_count.setLongValue(num);
            for (int j = 0; j < f_follow.length; j++) {
                f_follow[j].setLongValue(0L);
                f_precede[j].setLongValue(0L);
                f_followerprecede[j].setLongValue(0L);
            }
            if (splits.length > 2 && !splits[2].isEmpty()) {
                // precede or follow or followerprecede
                String[] splits_ = splits[2].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 3 && !splits[3].isEmpty()) {
                // should be follow or followerprecede
                String[] splits_ = splits[3].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 4 && !splits[4].isEmpty()) {
                // should be followerprecede
                String[] splits_ = splits[4].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            writer_ngram.addDocument(doc);
            while (N.length <= n) {
                N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
            }
            if (num == 1L)
                N[n][1]++;
            else if (num == 2L)
                N[n][2]++;
            else if (num == 3L)
                N[n][3]++;
            else if (num == 4L)
                N[n][4]++;
            else
                N[n][5]++;
            N[n][0]++;
            S[n] += num;
        } catch (Exception e) {
            LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line,
                    ngram_joined_counts_file, c, e);
        }
    }
    writer_ngram.forceMergeDeletes();
    writer_ngram.commit();
    writer_ngram.close();
    StringBuilder b = new StringBuilder(String.format(
            "#%n# Number of times where an ngram occurred: %n# at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
            N.length - 1));
    for (int n = 1; n < N.length; n++)
        b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
    for (int n = 1; n < S.length; n++)
        b.append(String.format("s%d=%d%n", n, S[n]));
    FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());
}