Usage examples for org.apache.commons.io.LineIterator#next()
Signature: public Object next()
Returns the next line read from the underlying Reader.
. From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv1Reader.java
/**
 * Iterates through all lines of a WebAnno TSV v1 file and collects the available annotations.
 * <p>
 * Line format (10 tab-separated columns, i.e. 9 tab characters per line):
 * <ol>
 * <li>token number within the sentence (a blank line marks the end of a sentence)</li>
 * <li>the token text</li>
 * <li>the lemma annotation</li>
 * <li>the POS annotation</li>
 * <li>named entity annotations (multiple annotations separated by the | character)</li>
 * <li>compatibility column, skipped (see inline comment)</li>
 * <li>the token number of the dependency-parse dependent (origin token)</li>
 * <li>the function/type of the dependency relation</li>
 * <li>&amp; 9. currently undefined, skipped</li>
 * </ol>
 * Lines starting with {@code #text=} carry the original sentence text; other {@code #} lines
 * are comments. All output maps are keyed by a document-wide running token number
 * ({@code base} + per-sentence token number).
 *
 * @param aIs the TSV input stream
 * @param aEncoding the character encoding of the input stream
 * @param text receives the document text (from {@code #text=} lines, or rebuilt from tokens)
 * @param tokens receives token text per global token number
 * @param pos receives POS tags per global token number
 * @param lemma receives lemmas per global token number
 * @param namedEntity receives NE tags per global token number ("_"/"-" normalized to "O")
 * @param dependencyFunction receives dependency relation labels per global token number
 * @param dependencyDependent receives the global token number of the dependency dependent
 * @param firstTokenInSentence receives the global token number starting each sentence
 * @throws IOException if the file is not valid TSV (wrong column count) or cannot be read
 */
private void setAnnotations(InputStream aIs, String aEncoding, StringBuilder text, Map<Integer, String> tokens,
        Map<Integer, String> pos, Map<Integer, String> lemma, Map<Integer, String> namedEntity,
        Map<Integer, String> dependencyFunction, Map<Integer, Integer> dependencyDependent,
        List<Integer> firstTokenInSentence) throws IOException {
    int tokenNumber = 0;
    boolean first = true;
    // Offset added to per-sentence token numbers to make them document-wide.
    int base = 0;
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    boolean textFound = false;
    // Fallback document text, rebuilt from tokens when no #text= lines are present.
    StringBuffer tmpText = new StringBuffer();
    while (lineIterator.hasNext()) {
        String line = lineIterator.next().trim();
        if (line.startsWith("#text=")) {
            text.append(line.substring(6) + "\n");
            textFound = true;
            continue;
        }
        if (line.startsWith("#")) {
            continue;// it is a comment line
        }
        int count = StringUtils.countMatches(line, "\t");
        if (line.isEmpty()) {
            continue;
        }
        if (count != 9) {// not a proper TSV file: expect exactly 9 tabs (10 columns)
            getUimaContext().getLogger().log(Level.INFO, "This is not a valid TSV File");
            throw new IOException(fileName + " This is not a valid TSV File");
        }
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        if (first) {
            tokenNumber = Integer.parseInt(line.substring(0, line.indexOf("\t")));
            firstTokenInSentence.add(tokenNumber);
            first = false;
        } else {
            int lineNumber = Integer.parseInt(line.substring(0, line.indexOf("\t")));
            // Token number resets to 1 at the start of every new sentence; remember where
            // the previous sentence ended so subsequent tokens get document-wide numbers.
            if (lineNumber == 1) {
                base = tokenNumber;
                firstTokenInSentence.add(base);
            }
            tokenNumber = base + Integer.parseInt(line.substring(0, line.indexOf("\t")));
        }
        // Consumes exactly the 10 columns of this line, so the loop body runs once per line.
        while (lineTk.hasMoreElements()) {
            lineTk.nextToken(); // column 1: token number, already parsed above
            String token = lineTk.nextToken();
            // for backward compatibility
            tmpText.append(token + " ");
            tokens.put(tokenNumber, token);
            lemma.put(tokenNumber, lineTk.nextToken());
            pos.put(tokenNumber, lineTk.nextToken());
            String ne = lineTk.nextToken();
            lineTk.nextToken();// make it compatible with prev WebAnno TSV reader
            namedEntity.put(tokenNumber, (ne.equals("_") || ne.equals("-")) ? "O" : ne);
            String dependentValue = lineTk.nextToken();
            if (NumberUtils.isDigits(dependentValue)) {
                int dependent = Integer.parseInt(dependentValue);
                // Dependent 0 means the sentence root; otherwise rebase to a global number.
                dependencyDependent.put(tokenNumber, dependent == 0 ? 0 : base + dependent);
                dependencyFunction.put(tokenNumber, lineTk.nextToken());
            } else {
                lineTk.nextToken(); // no numeric dependent: skip the function column
            }
            lineTk.nextToken(); // columns 9 and 10: currently undefined
            lineTk.nextToken();
        }
    }
    if (!textFound) {
        text.append(tmpText);
    }
}
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
private static void buildPredictFile(File apredFt, File aPredFile, List<List<String>> aPredictions, AnnotationFeature aFeature) throws IOException { LineIterator it = IOUtils.lineIterator(new FileReader(apredFt)); StringBuffer predBuffer = new StringBuffer(); int i = 0;/*www.j a v a2 s. co m*/ while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { predBuffer.append("\n"); continue; } StringTokenizer st = new StringTokenizer(line, " "); // if the target feature is on multiple token, we do not need the morphological features // in the prediction file if (aFeature.getLayer().isMultipleTokens()) { predBuffer.append(st.nextToken() + " "); } else { while (st.hasMoreTokens()) { predBuffer.append(st.nextToken() + " "); } } for (List<String> prediction : aPredictions) { predBuffer.append(prediction.get(i) + " "); } // add its predBuffer.append("\n"); i++; } IOUtils.write(predBuffer.toString(), new FileOutputStream(aPredFile)); }
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
private static void buildTrainFile(File aBaseFile, File aTrainFile, List<List<String>> aPredictions) throws IOException { LineIterator it = IOUtils.lineIterator(new FileReader(aBaseFile)); StringBuffer trainBuffer = new StringBuffer(); int i = 0;//from w w w. j a v a 2 s .c o m while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { trainBuffer.append("\n"); continue; } StringTokenizer st = new StringTokenizer(line, " "); String label = ""; String feature = ""; // Except the last token, which is the label, maintain the line while (st.hasMoreTokens()) { feature = st.nextToken(); if (label.equals("")) { // first time label = feature; continue; } trainBuffer.append(label + " "); label = feature; } for (List<String> prediction : aPredictions) { trainBuffer.append(prediction.get(i) + " "); } // add its own label trainBuffer.append(label + "\n"); i++; } IOUtils.write(trainBuffer.toString(), new FileOutputStream(aTrainFile)); }
From source file:mitm.common.postfix.PostfixQueueParser.java
/**
 * Parses the textual output of a Postfix mail queue listing and feeds each complete queue
 * entry to the given handler.
 * <p>
 * The first line is skipped (it is either the header or "Mail queue is empty"). Entries may
 * span multiple lines, separated by an empty line; a line starting with {@code --} marks the
 * end of the listing. When a search pattern is configured, only matching entries are handed
 * to the handler. The handler can stop the parse early by returning {@code false}.
 *
 * @param queue the raw queue listing
 * @param lineHandler callback invoked once per (matching) queue entry; must not be null
 */
private void parse(String queue, LineHandler lineHandler) {
    Check.notNull(lineHandler, "lineHandler");

    StringReader queueReader = new StringReader(queue);

    try {
        LineIterator lines = IOUtils.lineIterator(queueReader);

        /*
         * If the mail queue is empty the first line is "Mail queue is empty". If the mail
         * queue is not empty the first line should be the header. Either way, skip it.
         */
        if (lines.hasNext()) {
            lines.next();
        }

        while (lines.hasNext()) {
            String entry = lines.nextLine();

            if (entry.startsWith("--")) {
                // The trailer line starts with -- so we are finished.
                break;
            }

            /*
             * Fold all lines belonging to one queue item into a single string. Items may
             * use multiple lines, terminated by an empty line.
             */
            while (lines.hasNext()) {
                String continuation = lines.nextLine();

                if (continuation.length() == 0) {
                    break;
                }

                entry = entry + " " + continuation;
            }

            boolean accepted = true;

            if (searchPattern != null) {
                accepted = searchPattern.matcher(entry).find();
            }

            if (accepted && !lineHandler.lineFound(entry)) {
                break; // handler asked us to stop
            }
        }
    } finally {
        IOUtils.closeQuietly(queueReader);
    }
}
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
private static void getFeatureOtherLayer(MiraTemplate aTemplate, RepositoryService aRepository, AnnotationService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, int beamSize, boolean maxPosteriors, List<List<String>> predictions, Mira mira, File predFtFile, File predcitedFile, SourceDocument document) throws FileNotFoundException, IOException, ClassNotFoundException, UIMAException { // other layers as training document for (AnnotationFeature feature : aTemplate.getOtherFeatures()) { int shiftColumns = 0; int nbest = 1; String modelName = aAutomationService.getMiraModel(feature, true, null).getAbsolutePath(); if (!new File(modelName).exists()) { addOtherFeatureFromAnnotation(feature, aRepository, aAnnotationService, aUserDao, predictions, document);/*w w w. ja v a 2s . com*/ continue; } String testName = predFtFile.getAbsolutePath(); PrintStream stream = new PrintStream(predcitedFile); BufferedReader input = new BufferedReader(new InputStreamReader(System.in)); if (testName != null) { input = new BufferedReader(new FileReader(testName)); } mira.loadModel(modelName); mira.setShiftColumns(shiftColumns); mira.nbest = nbest; mira.beamSize = beamSize; mira.maxPosteriors = maxPosteriors; mira.test(input, stream); LineIterator it = IOUtils.lineIterator(new FileReader(predcitedFile)); List<String> annotations = new ArrayList<String>(); while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { continue; } StringTokenizer st = new StringTokenizer(line, " "); String tag = ""; while (st.hasMoreTokens()) { tag = st.nextToken(); } annotations.add(tag); } predictions.add(annotations); } }
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
private static void getFeaturesTabSep(MiraTemplate aTemplate, RepositoryService aRepository, AutomationService aAutomationService, int beamSize, boolean maxPosteriors, AnnotationFeature layerFeature, List<List<String>> predictions, Mira mira, File predFile, File predcitedFile)//from w w w .ja v a 2 s. c om throws FileNotFoundException, IOException, ClassNotFoundException, AutomationException { for (SourceDocument document : aAutomationService .listTabSepDocuments(aTemplate.getTrainFeature().getProject())) { int shiftColumns = 0; int nbest = 1; String modelName = aAutomationService.getMiraModel(layerFeature, true, document).getAbsolutePath(); if (!new File(modelName).exists()) { continue; } String testName = predFile.getAbsolutePath(); PrintStream stream = new PrintStream(predcitedFile); BufferedReader input = new BufferedReader(new InputStreamReader(System.in)); if (testName != null) { input = new BufferedReader(new FileReader(testName)); } mira.loadModel(modelName); mira.setShiftColumns(shiftColumns); mira.nbest = nbest; mira.beamSize = beamSize; mira.maxPosteriors = maxPosteriors; try { mira.test(input, stream); } catch (Exception e) { throw new AutomationException(document.getName() + " is Invalid TAB-SEP file!"); } LineIterator it = IOUtils.lineIterator(new FileReader(predcitedFile)); List<String> annotations = new ArrayList<String>(); while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { continue; } StringTokenizer st = new StringTokenizer(line, " "); String tag = ""; while (st.hasMoreTokens()) { tag = st.nextToken(); } annotations.add(tag); } predictions.add(annotations); } }
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
/**
 * (Re)generates the MIRA train file for the template's target feature, but only if at least
 * one relevant source document changed since the last run (checks A-D below). Training data
 * is gathered from training documents, curated documents, and TAB-SEP documents of the
 * project.
 *
 * @param aBase when {@code true}, writes the base training document (no feature column) and
 *            leaves documents marked unprocessed; when {@code false}, writes the full
 *            training document and marks documents processed
 * @throws IOException if a file cannot be read or written
 * @throws AutomationException if a TAB-SEP document does not have exactly two columns
 */
public static void generateTrainDocument(MiraTemplate aTemplate, RepositoryService aRepository,
        AnnotationService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase)
        throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    String username = SecurityContextHolder.getContext().getAuthentication().getName();
    User user = aUserDao.get(username);
    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;
    // A. A training document for one of the other train layers changed.
    for (AnnotationFeature otherrFeature : aTemplate.getOtherFeatures()) {
        for (SourceDocument document : aRepository
                .listSourceDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null
                    && document.getFeature().equals(otherrFeature)) {
                documentChanged = true;
                break;
            }
        }
    }
    // B. A training document for the main training layer changed.
    for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) {
        if (!document.isProcessed()
                && (document.getFeature() != null && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }
    // C. A new curation document arrived.
    for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) {
        if (!document.isProcessed()
                && document.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
            documentChanged = true;
            break;
        }
    }
    // D. A TAB-SEP training document changed.
    for (SourceDocument document : aAutomationService
            .listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null
                && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return; // nothing changed since the last run — keep the existing train file
    }
    File trainFile;
    // NOTE(review): the suffixes look swapped here (aBase -> ".train.ft", otherwise
    // ".train.base") — confirm against the consumers of these files before changing.
    if (aBase) {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    // NOTE(review): trainOut is not closed if an exception is thrown below (only the
    // TAB-SEP validation path closes it explicitly) — consider try/finally.
    BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile));
    AutomationTypeAdapter adapter = (AutomationTypeAdapter) TypeUtil.getAdapter(aAnnotationService,
            feature.getLayer());
    // Training documents (curated or webanno-compatible imported ones — read using UIMA).
    for (SourceDocument sourceDocument : aRepository.listSourceDocuments(feature.getProject())) {
        if ((sourceDocument.isTrainingDocument() && sourceDocument.getFeature() != null
                && sourceDocument.getFeature().equals(feature))) {
            JCas jCas = aRepository.readAnnotationCas(sourceDocument, user);
            for (Sentence sentence : select(jCas, Sentence.class)) {
                if (aBase) {// base training document
                    trainOut.append(getMiraLine(sentence, null, adapter).toString() + "\n");
                } else {// training document with other features
                    trainOut.append(getMiraLine(sentence, feature, adapter).toString() + "\n");
                }
            }
            sourceDocument.setProcessed(!aBase);
            if (!aBase) {
                status.setTrainDocs(status.getTrainDocs() - 1);
            }
        } else if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
            JCas jCas = aRepository.readCurationCas(sourceDocument);
            for (Sentence sentence : select(jCas, Sentence.class)) {
                if (aBase) {// base training document
                    trainOut.append(getMiraLine(sentence, null, adapter).toString() + "\n");
                } else {// training document with other features
                    trainOut.append(getMiraLine(sentence, feature, adapter).toString() + "\n");
                }
            }
            sourceDocument.setProcessed(!aBase);
            if (!aBase) {
                status.setTrainDocs(status.getTrainDocs() - 1);
            }
        }
    }
    // TAB-SEP documents used as target-layer train documents (two columns: token TAB label).
    for (SourceDocument document : aAutomationService.listTabSepDocuments(feature.getProject())) {
        if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null
                && document.getFeature().equals(feature)) {
            File tabSepFile = new File(aRepository.getDocumentFolder(document), document.getName());
            // NOTE(review): this LineIterator/FileReader is never closed — resource leak.
            LineIterator it = IOUtils.lineIterator(new FileReader(tabSepFile));
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().equals("")) {
                    trainOut.append("\n"); // preserve sentence boundary
                } else {
                    StringTokenizer st = new StringTokenizer(line, "\t");
                    if (st.countTokens() != 2) {
                        trainOut.close();
                        throw new AutomationException("This is not a valid TAB-SEP document");
                    }
                    if (aBase) {
                        trainOut.append(getMiraLineForTabSep(st.nextToken(), ""));
                    } else {
                        trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                    }
                }
            }
        }
    }
    trainOut.close();
}
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
/**
 * Runs the trained MIRA model of the template's target feature over every unprocessed,
 * non-training source document of the project, reads the predicted tag (the last column of
 * each non-blank output line) and writes the predictions back to the document's correction
 * CAS. Each handled document is marked processed and the automation status counter is
 * decremented.
 */
public static void predict(MiraTemplate aTemplate, RepositoryService aRepository,
        AutomationService aAutomationService, UserDao aUserDao)
        throws CASException, UIMAException, ClassNotFoundException, IOException, BratAnnotationException {
    AnnotationFeature layerFeature = aTemplate.getTrainFeature();
    File miraDir = aAutomationService.getMiraDir(layerFeature);
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) {
        if (!document.isProcessed() && !document.isTrainingDocument()) {
            File predFile = new File(miraDir, document.getId() + ".pred");
            Mira mira = new Mira();
            int shiftColumns = 0;
            int nbest = 1;
            int beamSize = 0;
            boolean maxPosteriors = false;
            String modelName = aAutomationService.getMiraModel(layerFeature, false, null).getAbsolutePath();
            String testName = predFile.getAbsolutePath();
            File predcitedFile = new File(predFile.getAbsolutePath() + "-pred");
            // NOTE(review): stream and input are never closed — resource leak.
            PrintStream stream = new PrintStream(predcitedFile);
            // NOTE(review): dead code — testName (an absolute path) is never null, so this
            // System.in reader is always immediately replaced below.
            BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
            if (testName != null) {
                input = new BufferedReader(new FileReader(testName));
            }
            mira.loadModel(modelName);
            mira.setShiftColumns(shiftColumns);
            mira.nbest = nbest;
            mira.beamSize = beamSize;
            mira.maxPosteriors = maxPosteriors;
            mira.test(input, stream);
            LOG.info("Prediction is wrtten to a MIRA File. To be done is writing back to the CAS");
            // Collect the predicted tag (last space-separated column) of every non-blank line.
            // NOTE(review): this LineIterator/FileReader is never closed — resource leak.
            LineIterator it = IOUtils.lineIterator(new FileReader(predcitedFile));
            List<String> annotations = new ArrayList<String>();
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().equals("")) {
                    continue;
                }
                StringTokenizer st = new StringTokenizer(line, " ");
                String tag = "";
                while (st.hasMoreTokens()) {
                    tag = st.nextToken();
                }
                annotations.add(tag);
            }
            LOG.info(annotations.size() + " Predictions found to be written to the CAS");
            JCas jCas = null;
            String username = SecurityContextHolder.getContext().getAuthentication().getName();
            User user = aUserDao.get(username);
            try {
                AnnotationDocument annoDocument = aRepository.getAnnotationDocument(document, user);
                jCas = aRepository.readAnnotationCas(annoDocument);
            } catch (DataRetrievalFailureException e) {
                // NOTE(review): exception swallowed — jCas stays null and automate(...) below
                // will most likely fail with an NPE; confirm intended behavior.
            }
            automate(jCas, layerFeature, annotations);
            LOG.info("Predictions found are written to the CAS");
            aRepository.writeCorrectionCas(jCas, document, user);
            document.setProcessed(true);
            status.setAnnoDocs(status.getAnnoDocs() - 1);
        }
    }
}
From source file:dk.netarkivet.harvester.indexserver.distribute.TestIndexRequestServer.java
private Set<Long> readLongsFromFile(File fileWithLongs) { Set<Long> resultSet = new HashSet<Long>(); try {//from ww w . j a va2 s .c o m LineIterator lineIterator = new LineIterator(new FileReader(fileWithLongs)); while (lineIterator.hasNext()) { String line = lineIterator.next(); resultSet.add(Long.parseLong(line)); } } catch (IOException e) { log.error("Unable to read from file '{}'. Returns set of size {}", fileWithLongs.getAbsolutePath(), resultSet.size()); } return resultSet; }
From source file:dk.netarkivet.harvester.datamodel.PartialHarvest.java
/** * This method is a duplicate of the addSeeds method but for seedsFile parameter * * @param seedsFile a newline-separated File containing the seeds to be added * @param templateName the name of the template to be used * @param maxBytes Maximum number of bytes to harvest per domain * @param maxObjects Maximum number of objects to harvest per domain *///from w w w . ja v a 2 s . c o m public void addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects, Map<String, String> attributeValues) { ArgumentNotValid.checkNotNull(seedsFile, "seeds"); ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist"); ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); if (!TemplateDAO.getInstance().exists(templateName)) { throw new UnknownID("No such template: " + templateName); } Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); StringBuilder invalidMessage = new StringBuilder( "Unable to create an event harvest.\n" + "The following seeds are invalid:\n"); boolean valid = true; // validate all the seeds in the file // those accepted are entered into the acceptedSeeds datastructure // Iterate through the contents of the file LineIterator seedIterator = null; try { seedIterator = new LineIterator(new FileReader(seedsFile)); while (seedIterator.hasNext()) { String seed = seedIterator.next(); boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds); if (!seedValid) { valid = false; } } } catch (IOException e) { throw new IOFailure("Unable to process seedsfile ", e); } finally { LineIterator.closeQuietly(seedIterator); } if (!valid) { throw new ArgumentNotValid(invalidMessage.toString()); } addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues); }