Usage examples for org.apache.commons.io.LineIterator#next()
Signature: public Object next()
Returns the next line read from the underlying Reader.
. From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv1Reader.java
/**
 * Iterates through all lines of a WebAnno TSV v1 file and collects the available annotations.
 * <p>
 * Line format (10 tab-separated columns, i.e. 9 tab characters per line):
 * <ol>
 * <li>token number within the sentence (a blank line marks the end of a sentence)</li>
 * <li>the token text</li>
 * <li>the lemma annotation</li>
 * <li>the POS annotation</li>
 * <li>named entity annotations (multiple annotations separated by the | character)</li>
 * <li>compatibility column, skipped (see inline comment)</li>
 * <li>the token number of the dependency-parse dependent (origin token)</li>
 * <li>the function/type of the dependency relation</li>
 * <li>&amp; 9. currently undefined, skipped</li>
 * </ol>
 * Lines starting with {@code #text=} carry the original sentence text; other {@code #} lines
 * are comments. All output maps are keyed by a document-wide running token number
 * ({@code base} + per-sentence token number).
 *
 * @param aIs the TSV input stream
 * @param aEncoding the character encoding of the input stream
 * @param text receives the document text (from {@code #text=} lines, or rebuilt from tokens)
 * @param tokens receives token text per global token number
 * @param pos receives POS tags per global token number
 * @param lemma receives lemmas per global token number
 * @param namedEntity receives NE tags per global token number ("_"/"-" normalized to "O")
 * @param dependencyFunction receives dependency relation labels per global token number
 * @param dependencyDependent receives the global token number of the dependency dependent
 * @param firstTokenInSentence receives the global token number starting each sentence
 * @throws IOException if the file is not valid TSV (wrong column count) or cannot be read
 */
private void setAnnotations(InputStream aIs, String aEncoding, StringBuilder text, Map<Integer, String> tokens,
        Map<Integer, String> pos, Map<Integer, String> lemma, Map<Integer, String> namedEntity,
        Map<Integer, String> dependencyFunction, Map<Integer, Integer> dependencyDependent,
        List<Integer> firstTokenInSentence) throws IOException {
    int tokenNumber = 0;
    boolean first = true;
    // Offset added to per-sentence token numbers to make them document-wide.
    int base = 0;
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    boolean textFound = false;
    // Fallback document text, rebuilt from tokens when no #text= lines are present.
    StringBuffer tmpText = new StringBuffer();
    while (lineIterator.hasNext()) {
        String line = lineIterator.next().trim();
        if (line.startsWith("#text=")) {
            text.append(line.substring(6) + "\n");
            textFound = true;
            continue;
        }
        if (line.startsWith("#")) {
            continue;// it is a comment line
        }
        int count = StringUtils.countMatches(line, "\t");
        if (line.isEmpty()) {
            continue;
        }
        if (count != 9) {// not a proper TSV file: expect exactly 9 tabs (10 columns)
            getUimaContext().getLogger().log(Level.INFO, "This is not a valid TSV File");
            throw new IOException(fileName + " This is not a valid TSV File");
        }
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        if (first) {
            tokenNumber = Integer.parseInt(line.substring(0, line.indexOf("\t")));
            firstTokenInSentence.add(tokenNumber);
            first = false;
        } else {
            int lineNumber = Integer.parseInt(line.substring(0, line.indexOf("\t")));
            // Token number resets to 1 at the start of every new sentence; remember where
            // the previous sentence ended so subsequent tokens get document-wide numbers.
            if (lineNumber == 1) {
                base = tokenNumber;
                firstTokenInSentence.add(base);
            }
            tokenNumber = base + Integer.parseInt(line.substring(0, line.indexOf("\t")));
        }
        // Consumes exactly the 10 columns of this line, so the loop body runs once per line.
        while (lineTk.hasMoreElements()) {
            lineTk.nextToken(); // column 1: token number, already parsed above
            String token = lineTk.nextToken();
            // for backward compatibility
            tmpText.append(token + " ");
            tokens.put(tokenNumber, token);
            lemma.put(tokenNumber, lineTk.nextToken());
            pos.put(tokenNumber, lineTk.nextToken());
            String ne = lineTk.nextToken();
            lineTk.nextToken();// make it compatible with prev WebAnno TSV reader
            namedEntity.put(tokenNumber, (ne.equals("_") || ne.equals("-")) ? "O" : ne);
            String dependentValue = lineTk.nextToken();
            if (NumberUtils.isDigits(dependentValue)) {
                int dependent = Integer.parseInt(dependentValue);
                // Dependent 0 means the sentence root; otherwise rebase to a global number.
                dependencyDependent.put(tokenNumber, dependent == 0 ? 0 : base + dependent);
                dependencyFunction.put(tokenNumber, lineTk.nextToken());
            } else {
                lineTk.nextToken(); // no numeric dependent: skip the function column
            }
            lineTk.nextToken(); // columns 9 and 10: currently undefined
            lineTk.nextToken();
        }
    }
    if (!textFound) {
        text.append(tmpText);
    }
}
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
private static void buildPredictFile(File apredFt, File aPredFile, List<List<String>> aPredictions, AnnotationFeature aFeature) throws IOException { LineIterator it = IOUtils.lineIterator(new FileReader(apredFt)); StringBuffer predBuffer = new StringBuffer(); int i = 0;/*www.j a v a2 s. co m*/ while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { predBuffer.append("\n"); continue; } StringTokenizer st = new StringTokenizer(line, " "); // if the target feature is on multiple token, we do not need the morphological features // in the prediction file if (aFeature.getLayer().isMultipleTokens()) { predBuffer.append(st.nextToken() + " "); } else { while (st.hasMoreTokens()) { predBuffer.append(st.nextToken() + " "); } } for (List<String> prediction : aPredictions) { predBuffer.append(prediction.get(i) + " "); } // add its predBuffer.append("\n"); i++; } IOUtils.write(predBuffer.toString(), new FileOutputStream(aPredFile)); }
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
private static void buildTrainFile(File aBaseFile, File aTrainFile, List<List<String>> aPredictions) throws IOException { LineIterator it = IOUtils.lineIterator(new FileReader(aBaseFile)); StringBuffer trainBuffer = new StringBuffer(); int i = 0;//from w w w. j a v a 2 s .c o m while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { trainBuffer.append("\n"); continue; } StringTokenizer st = new StringTokenizer(line, " "); String label = ""; String feature = ""; // Except the last token, which is the label, maintain the line while (st.hasMoreTokens()) { feature = st.nextToken(); if (label.equals("")) { // first time label = feature; continue; } trainBuffer.append(label + " "); label = feature; } for (List<String> prediction : aPredictions) { trainBuffer.append(prediction.get(i) + " "); } // add its own label trainBuffer.append(label + "\n"); i++; } IOUtils.write(trainBuffer.toString(), new FileOutputStream(aTrainFile)); }
From source file:mitm.common.postfix.PostfixQueueParser.java
/**
 * Parses the textual output of a Postfix mail queue listing and feeds each complete queue
 * entry to the given handler.
 * <p>
 * The first line is skipped (it is either the header or "Mail queue is empty"). Entries may
 * span multiple lines, separated by an empty line; a line starting with {@code --} marks the
 * end of the listing. When a search pattern is configured, only matching entries are handed
 * to the handler. The handler can stop the parse early by returning {@code false}.
 *
 * @param queue the raw queue listing
 * @param lineHandler callback invoked once per (matching) queue entry; must not be null
 */
private void parse(String queue, LineHandler lineHandler) {
    Check.notNull(lineHandler, "lineHandler");

    StringReader queueReader = new StringReader(queue);

    try {
        LineIterator lines = IOUtils.lineIterator(queueReader);

        /*
         * If the mail queue is empty the first line is "Mail queue is empty". If the mail
         * queue is not empty the first line should be the header. Either way, skip it.
         */
        if (lines.hasNext()) {
            lines.next();
        }

        while (lines.hasNext()) {
            String entry = lines.nextLine();

            if (entry.startsWith("--")) {
                // The trailer line starts with -- so we are finished.
                break;
            }

            /*
             * Fold all lines belonging to one queue item into a single string. Items may
             * use multiple lines, terminated by an empty line.
             */
            while (lines.hasNext()) {
                String continuation = lines.nextLine();

                if (continuation.length() == 0) {
                    break;
                }

                entry = entry + " " + continuation;
            }

            boolean accepted = true;

            if (searchPattern != null) {
                accepted = searchPattern.matcher(entry).find();
            }

            if (accepted && !lineHandler.lineFound(entry)) {
                break; // handler asked us to stop
            }
        }
    } finally {
        IOUtils.closeQuietly(queueReader);
    }
}
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
private static void getFeatureOtherLayer(MiraTemplate aTemplate, RepositoryService aRepository, AnnotationService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, int beamSize, boolean maxPosteriors, List<List<String>> predictions, Mira mira, File predFtFile, File predcitedFile, SourceDocument document) throws FileNotFoundException, IOException, ClassNotFoundException, UIMAException { // other layers as training document for (AnnotationFeature feature : aTemplate.getOtherFeatures()) { int shiftColumns = 0; int nbest = 1; String modelName = aAutomationService.getMiraModel(feature, true, null).getAbsolutePath(); if (!new File(modelName).exists()) { addOtherFeatureFromAnnotation(feature, aRepository, aAnnotationService, aUserDao, predictions, document);/*w w w. ja v a 2s . com*/ continue; } String testName = predFtFile.getAbsolutePath(); PrintStream stream = new PrintStream(predcitedFile); BufferedReader input = new BufferedReader(new InputStreamReader(System.in)); if (testName != null) { input = new BufferedReader(new FileReader(testName)); } mira.loadModel(modelName); mira.setShiftColumns(shiftColumns); mira.nbest = nbest; mira.beamSize = beamSize; mira.maxPosteriors = maxPosteriors; mira.test(input, stream); LineIterator it = IOUtils.lineIterator(new FileReader(predcitedFile)); List<String> annotations = new ArrayList<String>(); while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { continue; } StringTokenizer st = new StringTokenizer(line, " "); String tag = ""; while (st.hasMoreTokens()) { tag = st.nextToken(); } annotations.add(tag); } predictions.add(annotations); } }
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
private static void getFeaturesTabSep(MiraTemplate aTemplate, RepositoryService aRepository, AutomationService aAutomationService, int beamSize, boolean maxPosteriors, AnnotationFeature layerFeature, List<List<String>> predictions, Mira mira, File predFile, File predcitedFile)//from w w w .ja v a 2 s. c om throws FileNotFoundException, IOException, ClassNotFoundException, AutomationException { for (SourceDocument document : aAutomationService .listTabSepDocuments(aTemplate.getTrainFeature().getProject())) { int shiftColumns = 0; int nbest = 1; String modelName = aAutomationService.getMiraModel(layerFeature, true, document).getAbsolutePath(); if (!new File(modelName).exists()) { continue; } String testName = predFile.getAbsolutePath(); PrintStream stream = new PrintStream(predcitedFile); BufferedReader input = new BufferedReader(new InputStreamReader(System.in)); if (testName != null) { input = new BufferedReader(new FileReader(testName)); } mira.loadModel(modelName); mira.setShiftColumns(shiftColumns); mira.nbest = nbest; mira.beamSize = beamSize; mira.maxPosteriors = maxPosteriors; try { mira.test(input, stream); } catch (Exception e) { throw new AutomationException(document.getName() + " is Invalid TAB-SEP file!"); } LineIterator it = IOUtils.lineIterator(new FileReader(predcitedFile)); List<String> annotations = new ArrayList<String>(); while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { continue; } StringTokenizer st = new StringTokenizer(line, " "); String tag = ""; while (st.hasMoreTokens()) { tag = st.nextToken(); } annotations.add(tag); } predictions.add(annotations); } }
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
/**
 * (Re)generates the MIRA train file for the template's target feature, but only if at least
 * one relevant source document changed since the last run (checks A-D below). Training data
 * is gathered from training documents, curated documents, and TAB-SEP documents of the
 * project.
 *
 * @param aBase when {@code true}, writes the base training document (no feature column) and
 *            leaves documents marked unprocessed; when {@code false}, writes the full
 *            training document and marks documents processed
 * @throws IOException if a file cannot be read or written
 * @throws AutomationException if a TAB-SEP document does not have exactly two columns
 */
public static void generateTrainDocument(MiraTemplate aTemplate, RepositoryService aRepository,
        AnnotationService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase)
        throws IOException, UIMAException, ClassNotFoundException, AutomationException {
    File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature());
    if (!miraDir.exists()) {
        FileUtils.forceMkdir(miraDir);
    }
    String username = SecurityContextHolder.getContext().getAuthentication().getName();
    User user = aUserDao.get(username);
    AnnotationFeature feature = aTemplate.getTrainFeature();
    boolean documentChanged = false;
    // A. A training document for one of the other train layers changed.
    for (AnnotationFeature otherrFeature : aTemplate.getOtherFeatures()) {
        for (SourceDocument document : aRepository
                .listSourceDocuments(aTemplate.getTrainFeature().getProject())) {
            if (!document.isProcessed() && document.getFeature() != null
                    && document.getFeature().equals(otherrFeature)) {
                documentChanged = true;
                break;
            }
        }
    }
    // B. A training document for the main training layer changed.
    for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) {
        if (!document.isProcessed()
                && (document.getFeature() != null && document.getFeature().equals(feature))) {
            documentChanged = true;
            break;
        }
    }
    // C. A new curation document arrived.
    for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) {
        if (!document.isProcessed()
                && document.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
            documentChanged = true;
            break;
        }
    }
    // D. A TAB-SEP training document changed.
    for (SourceDocument document : aAutomationService
            .listTabSepDocuments(aTemplate.getTrainFeature().getProject())) {
        if (!document.isProcessed() && document.getFeature() != null
                && document.getFeature().equals(feature)) {
            documentChanged = true;
            break;
        }
    }
    if (!documentChanged) {
        return; // nothing changed since the last run — keep the existing train file
    }
    File trainFile;
    // NOTE(review): the suffixes look swapped here (aBase -> ".train.ft", otherwise
    // ".train.base") — confirm against the consumers of these files before changing.
    if (aBase) {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft");
    } else {
        trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base");
    }
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    // NOTE(review): trainOut is not closed if an exception is thrown below (only the
    // TAB-SEP validation path closes it explicitly) — consider try/finally.
    BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile));
    AutomationTypeAdapter adapter = (AutomationTypeAdapter) TypeUtil.getAdapter(aAnnotationService,
            feature.getLayer());
    // Training documents (curated or webanno-compatible imported ones — read using UIMA).
    for (SourceDocument sourceDocument : aRepository.listSourceDocuments(feature.getProject())) {
        if ((sourceDocument.isTrainingDocument() && sourceDocument.getFeature() != null
                && sourceDocument.getFeature().equals(feature))) {
            JCas jCas = aRepository.readAnnotationCas(sourceDocument, user);
            for (Sentence sentence : select(jCas, Sentence.class)) {
                if (aBase) {// base training document
                    trainOut.append(getMiraLine(sentence, null, adapter).toString() + "\n");
                } else {// training document with other features
                    trainOut.append(getMiraLine(sentence, feature, adapter).toString() + "\n");
                }
            }
            sourceDocument.setProcessed(!aBase);
            if (!aBase) {
                status.setTrainDocs(status.getTrainDocs() - 1);
            }
        } else if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) {
            JCas jCas = aRepository.readCurationCas(sourceDocument);
            for (Sentence sentence : select(jCas, Sentence.class)) {
                if (aBase) {// base training document
                    trainOut.append(getMiraLine(sentence, null, adapter).toString() + "\n");
                } else {// training document with other features
                    trainOut.append(getMiraLine(sentence, feature, adapter).toString() + "\n");
                }
            }
            sourceDocument.setProcessed(!aBase);
            if (!aBase) {
                status.setTrainDocs(status.getTrainDocs() - 1);
            }
        }
    }
    // TAB-SEP documents used as target-layer train documents (two columns: token TAB label).
    for (SourceDocument document : aAutomationService.listTabSepDocuments(feature.getProject())) {
        if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null
                && document.getFeature().equals(feature)) {
            File tabSepFile = new File(aRepository.getDocumentFolder(document), document.getName());
            // NOTE(review): this LineIterator/FileReader is never closed — resource leak.
            LineIterator it = IOUtils.lineIterator(new FileReader(tabSepFile));
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().equals("")) {
                    trainOut.append("\n"); // preserve sentence boundary
                } else {
                    StringTokenizer st = new StringTokenizer(line, "\t");
                    if (st.countTokens() != 2) {
                        trainOut.close();
                        throw new AutomationException("This is not a valid TAB-SEP document");
                    }
                    if (aBase) {
                        trainOut.append(getMiraLineForTabSep(st.nextToken(), ""));
                    } else {
                        trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken()));
                    }
                }
            }
        }
    }
    trainOut.close();
}
From source file:de.tudarmstadt.ukp.clarin.webanno.automation.util.AutomationUtil.java
/**
 * Runs the trained MIRA model of the template's target feature over every unprocessed,
 * non-training source document of the project, reads the predicted tag (the last column of
 * each non-blank output line) and writes the predictions back to the document's correction
 * CAS. Each handled document is marked processed and the automation status counter is
 * decremented.
 */
public static void predict(MiraTemplate aTemplate, RepositoryService aRepository,
        AutomationService aAutomationService, UserDao aUserDao)
        throws CASException, UIMAException, ClassNotFoundException, IOException, BratAnnotationException {
    AnnotationFeature layerFeature = aTemplate.getTrainFeature();
    File miraDir = aAutomationService.getMiraDir(layerFeature);
    AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate);
    for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) {
        if (!document.isProcessed() && !document.isTrainingDocument()) {
            File predFile = new File(miraDir, document.getId() + ".pred");
            Mira mira = new Mira();
            int shiftColumns = 0;
            int nbest = 1;
            int beamSize = 0;
            boolean maxPosteriors = false;
            String modelName = aAutomationService.getMiraModel(layerFeature, false, null).getAbsolutePath();
            String testName = predFile.getAbsolutePath();
            File predcitedFile = new File(predFile.getAbsolutePath() + "-pred");
            // NOTE(review): stream and input are never closed — resource leak.
            PrintStream stream = new PrintStream(predcitedFile);
            // NOTE(review): dead code — testName (an absolute path) is never null, so this
            // System.in reader is always immediately replaced below.
            BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
            if (testName != null) {
                input = new BufferedReader(new FileReader(testName));
            }
            mira.loadModel(modelName);
            mira.setShiftColumns(shiftColumns);
            mira.nbest = nbest;
            mira.beamSize = beamSize;
            mira.maxPosteriors = maxPosteriors;
            mira.test(input, stream);
            LOG.info("Prediction is wrtten to a MIRA File. To be done is writing back to the CAS");
            // Collect the predicted tag (last space-separated column) of every non-blank line.
            // NOTE(review): this LineIterator/FileReader is never closed — resource leak.
            LineIterator it = IOUtils.lineIterator(new FileReader(predcitedFile));
            List<String> annotations = new ArrayList<String>();
            while (it.hasNext()) {
                String line = it.next();
                if (line.trim().equals("")) {
                    continue;
                }
                StringTokenizer st = new StringTokenizer(line, " ");
                String tag = "";
                while (st.hasMoreTokens()) {
                    tag = st.nextToken();
                }
                annotations.add(tag);
            }
            LOG.info(annotations.size() + " Predictions found to be written to the CAS");
            JCas jCas = null;
            String username = SecurityContextHolder.getContext().getAuthentication().getName();
            User user = aUserDao.get(username);
            try {
                AnnotationDocument annoDocument = aRepository.getAnnotationDocument(document, user);
                jCas = aRepository.readAnnotationCas(annoDocument);
            } catch (DataRetrievalFailureException e) {
                // NOTE(review): exception swallowed — jCas stays null and automate(...) below
                // will most likely fail with an NPE; confirm intended behavior.
            }
            automate(jCas, layerFeature, annotations);
            LOG.info("Predictions found are written to the CAS");
            aRepository.writeCorrectionCas(jCas, document, user);
            document.setProcessed(true);
            status.setAnnoDocs(status.getAnnoDocs() - 1);
        }
    }
}
From source file:dk.netarkivet.harvester.indexserver.distribute.TestIndexRequestServer.java
private Set<Long> readLongsFromFile(File fileWithLongs) { Set<Long> resultSet = new HashSet<Long>(); try {//from ww w . j a va2 s .c o m LineIterator lineIterator = new LineIterator(new FileReader(fileWithLongs)); while (lineIterator.hasNext()) { String line = lineIterator.next(); resultSet.add(Long.parseLong(line)); } } catch (IOException e) { log.error("Unable to read from file '{}'. Returns set of size {}", fileWithLongs.getAbsolutePath(), resultSet.size()); } return resultSet; }
From source file:dk.netarkivet.harvester.datamodel.PartialHarvest.java
/** * This method is a duplicate of the addSeeds method but for seedsFile parameter * * @param seedsFile a newline-separated File containing the seeds to be added * @param templateName the name of the template to be used * @param maxBytes Maximum number of bytes to harvest per domain * @param maxObjects Maximum number of objects to harvest per domain *///from w w w . ja v a 2 s . c o m public void addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects, Map<String, String> attributeValues) { ArgumentNotValid.checkNotNull(seedsFile, "seeds"); ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist"); ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); if (!TemplateDAO.getInstance().exists(templateName)) { throw new UnknownID("No such template: " + templateName); } Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); StringBuilder invalidMessage = new StringBuilder( "Unable to create an event harvest.\n" + "The following seeds are invalid:\n"); boolean valid = true; // validate all the seeds in the file // those accepted are entered into the acceptedSeeds datastructure // Iterate through the contents of the file LineIterator seedIterator = null; try { seedIterator = new LineIterator(new FileReader(seedsFile)); while (seedIterator.hasNext()) { String seed = seedIterator.next(); boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds); if (!seedValid) { valid = false; } } } catch (IOException e) { throw new IOFailure("Unable to process seedsfile ", e); } finally { LineIterator.closeQuietly(seedIterator); } if (!valid) { throw new ArgumentNotValid(invalidMessage.toString()); } addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues); }