eu.project.ttc.engines.CasStatCounter.java Source code

Introduction

Here is the source code for eu.project.ttc.engines.CasStatCounter.java
Source

/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.engines;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang.mutable.MutableInt;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Maps;
import com.google.common.collect.Ordering;

import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.VariationType;
import eu.project.ttc.resources.TermIndexResource;
import eu.project.ttc.types.SourceDocumentInformation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.JCasUtils;

/**
 * Compiles and logs CAS stats.
 *
 * <p>
 * For every processed CAS, the annotations of the annotation index are
 * counted per annotation type (short name). Two optional outputs exist:
 * </p>
 * <ul>
 * <li>when {@link #TO_TRACE_FILE} is set and {@link #DOCUMENT_PERIOD} is
 * strictly positive, one TSV line is appended to the trace file every
 * {@code DocumentPeriod} documents;</li>
 * <li>when {@link #STAT_NAME} is set, a summary of the term index is logged
 * once the collection has been fully processed.</li>
 * </ul>
 *
 * <p>
 * The trace file writer is released in {@link #destroy()}.
 * </p>
 *
 * @author Damien Cram
 *
 */
public class CasStatCounter extends JCasAnnotator_ImplBase {
    private static final Logger LOGGER = LoggerFactory.getLogger(CasStatCounter.class);

    /** Number of annotations seen so far, keyed by annotation type short name. */
    private final Map<String, MutableInt> counters = Maps.newHashMap();

    /** Label prefixed to every logged stat line; no final stats are logged when unset. */
    public static final String STAT_NAME = "StatName";
    @ConfigurationParameter(name = STAT_NAME, mandatory = false)
    private String statName;

    /** Number of documents between two trace lines; periodic tracing requires a value &gt; 0. */
    public static final String DOCUMENT_PERIOD = "DocumentPeriod";
    @ConfigurationParameter(name = DOCUMENT_PERIOD, mandatory = false, defaultValue = "-1")
    private int docPeriod;
    private boolean periodicStatEnabled = false;
    /** Number of CASes processed so far. */
    private int docIt;
    /** Sum of the document sizes reported by the source document annotations. */
    private long cumulatedFileSize;

    /** Path of the TSV trace file; no trace is written when unset. */
    public static final String TO_TRACE_FILE = "ToTraceFile";
    @ConfigurationParameter(name = TO_TRACE_FILE, mandatory = false)
    private String traceFileName;
    /** Writer to the trace file, or {@code null} when tracing is disabled or already closed. */
    private Writer fileWriter;

    /*
     * Columns: elapsed time (ms), nb of processed documents, cumulated file
     * size, nb of terms in the term index, nb of word annotations.
     */
    private static final String TSV_LINE_FORMAT = "%d\t%d\t%d\t%d\t%d\n";

    @ExternalResource(key = TermIndexResource.TERM_INDEX, mandatory = true)
    private TermIndexResource termIndexResource;

    /** Measures the time elapsed since {@link #initialize(UimaContext)}. */
    private Stopwatch sw;

    /**
     * Starts the stopwatch and, when a trace file is configured, opens the
     * writer to that file.
     *
     * @param context
     *            the UIMA context of this engine
     * @throws ResourceInitializationException
     *             if the trace file cannot be opened for writing
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        this.sw = Stopwatch.createStarted();
        if (traceFileName != null) {
            File file = new File(traceFileName);
            try {
                this.fileWriter = new FileWriter(file);
            } catch (IOException e) {
                LOGGER.error("Could not create a writer to file {}", traceFileName);
                throw new ResourceInitializationException(e);
            }
            // Periodic tracing needs both a writer and a positive period.
            this.periodicStatEnabled = docPeriod > 0;
            LOGGER.info("Tracing time performance to file {}", file.getAbsolutePath());
        }
    }

    /**
     * Updates the counters with the annotations of the given CAS and writes a
     * trace line when the document period is reached.
     *
     * @param aJCas
     *            the CAS to count annotations from
     * @throws AnalysisEngineProcessException
     *             if the trace line cannot be written
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        this.docIt++;
        Optional<SourceDocumentInformation> sourceDocumentAnnotation = JCasUtils.getSourceDocumentAnnotation(aJCas);
        if (sourceDocumentAnnotation.isPresent()) {
            this.cumulatedFileSize += sourceDocumentAnnotation.get().getDocumentSize();
        }
        FSIterator<Annotation> it = aJCas.getAnnotationIndex().iterator();
        while (it.hasNext()) {
            String typeName = it.next().getType().getShortName();
            MutableInt count = counters.get(typeName);
            if (count == null) {
                counters.put(typeName, new MutableInt(1));
            } else {
                count.increment();
            }
        }
        if (periodicStatEnabled && this.docIt % this.docPeriod == 0) {
            try {
                traceToFile();
            } catch (IOException e) {
                throw new AnalysisEngineProcessException(e);
            }
        }
    }

    /**
     * Appends one TSV line with the current stats to the trace file and
     * flushes it.
     *
     * @throws IOException
     *             if writing to the trace file fails
     */
    private void traceToFile() throws IOException {
        // No word annotation may have been counted yet: report 0 instead of failing.
        MutableInt wordCount = this.counters.get(WordAnnotation.class.getSimpleName());
        String line = String.format(TSV_LINE_FORMAT, this.sw.elapsed(TimeUnit.MILLISECONDS), this.docIt,
                this.cumulatedFileSize, this.termIndexResource.getTermIndex().getTerms().size(),
                wordCount == null ? 0 : wordCount.intValue());
        LOGGER.debug(line);
        this.fileWriter.write(line);
        this.fileWriter.flush();
    }

    /**
     * Closes the trace file writer, if any. Safe to call several times.
     * Failures are logged rather than propagated because callers
     * ({@link #destroy()}, {@link #finalize()}) cannot recover from them.
     */
    private void closeTraceWriter() {
        // Disable tracing first so that a late process() call never uses a closed writer.
        this.periodicStatEnabled = false;
        if (this.fileWriter == null) {
            return;
        }
        try {
            this.fileWriter.close();
        } catch (IOException e) {
            LOGGER.warn("Could not close the writer to file {}", traceFileName, e);
        } finally {
            this.fileWriter = null;
        }
    }

    /**
     * Releases the trace file writer when the engine is destroyed.
     */
    @Override
    public void destroy() {
        closeTraceWriter();
        super.destroy();
    }

    /**
     * Fallback that closes the trace writer if {@link #destroy()} was never
     * invoked. The writer may be {@code null} when no trace file was
     * configured.
     */
    @Override
    protected void finalize() throws Throwable {
        try {
            closeTraceWriter();
        } finally {
            super.finalize();
        }
    }

    /**
     * Logs the final stats, provided a stat name has been configured.
     */
    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        if (statName != null) {
            logStats();
        }
    }

    /**
     * Logs the most frequent annotation type and a summary of the term index
     * (terms, words, occurrences and variants).
     */
    private void logStats() {
        // Highest count first; ties are broken by annotation type name.
        Ordering<String> byCountThenName = Ordering.natural().reverse().onResultOf(Functions.forMap(counters))
                .compound(Ordering.natural());
        Map<String, MutableInt> sortedCounters = ImmutableSortedMap.copyOf(counters, byCountThenName);

        Iterator<Entry<String, MutableInt>> it = sortedCounters.entrySet().iterator();
        if (it.hasNext()) {// it will be empty if pipeline is run on empty collection
            Entry<String, MutableInt> mostFrequentAnno = it.next();
            LOGGER.info("[{}] {}: {} ", statName, mostFrequentAnno.getKey(),
                    mostFrequentAnno.getValue().intValue());
        }
        int nbSyntacticVariants = 0;
        int nbMorphologicalVariants = 0;
        int nbGraphicalVariants = 0;
        int nbOccurrences = 0;
        int nbPrimaryOccOccurrences = 0;
        TermIndex tIndex = termIndexResource.getTermIndex();
        for (Term t : tIndex.getTerms()) {
            nbMorphologicalVariants += Iterables.size(t.getVariations(VariationType.MORPHOLOGICAL));
            nbSyntacticVariants += Iterables.size(t.getVariations(VariationType.SYNTACTICAL));
            nbGraphicalVariants += Iterables.size(t.getVariations(VariationType.GRAPHICAL));
            nbOccurrences += t.getOccurrences().size();
            for (TermOccurrence o : t.getOccurrences()) {
                if (o.isPrimaryOccurrence()) {
                    nbPrimaryOccOccurrences++;
                }
            }
        }
        // graphical variants are bidirectional
        nbGraphicalVariants /= 2;

        LOGGER.info("[{}] Nb terms:    {} [sw: {}, mw: {}]", statName, tIndex.getTerms().size(),
                Iterators.size(tIndex.singleWordTermIterator()), Iterators.size(tIndex.multiWordTermIterator()));
        LOGGER.info("[{}] Nb words:    {} [compounds: {}]", statName, tIndex.getWords().size(),
                Iterators.size(tIndex.compoundWordTermIterator()));
        LOGGER.info("[{}] Nb occurrences: {} [primary: {}]", statName, nbOccurrences, nbPrimaryOccOccurrences);
        LOGGER.info("[{}] Nb variants: {} [morph: {}, syn: {}, graph: {}]", statName,
                nbMorphologicalVariants + nbSyntacticVariants + nbGraphicalVariants, nbMorphologicalVariants,
                nbSyntacticVariants, nbGraphicalVariants);
    }
}