eu.project.ttc.engines.SyntacticTermGatherer.java Source code

Java tutorial

Introduction

Here is the source code for eu.project.ttc.engines.SyntacticTermGatherer.java

Source

/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package eu.project.ttc.engines;

import java.math.BigInteger;
import java.util.List;
import java.util.ListIterator;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.TimeUnit;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Lists;

import eu.project.ttc.engines.variant.VariantRule;
import eu.project.ttc.engines.variant.VariantRuleIndex;
import eu.project.ttc.history.TermHistoryResource;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermVariation;
import eu.project.ttc.models.VariationType;
import eu.project.ttc.models.index.CustomIndexStats;
import eu.project.ttc.models.index.CustomTermIndex;
import eu.project.ttc.models.index.TermIndexes;
import eu.project.ttc.resources.ObserverResource;
import eu.project.ttc.resources.ObserverResource.SubTaskObserver;
import eu.project.ttc.resources.TermIndexResource;
import eu.project.ttc.resources.YamlVariantRules;

/**
 * UIMA annotator that applies the YAML variant rules to pairs of terms of the
 * term index once the whole collection has been processed.
 *
 * <p>Nothing happens at CAS level. In {@link #collectionProcessComplete()} the
 * custom indexes listed in {@link #RUN_CONFIGS} are cleaned, the number of
 * rule applications to perform is estimated, and then every pair of terms
 * sharing the same index key is tested against the variant rules in both
 * directions. Each matching rule adds a term variation to the source term.
 */
public class SyntacticTermGatherer extends JCasAnnotator_ImplBase {
    private static final Logger LOGGER = LoggerFactory.getLogger(SyntacticTermGatherer.class);
    public static final String TASK_NAME = "Syntactic variant gathering";

    /** Number of rule applications between two notifications of the task observer. */
    private static final int OBSERVING_STEP = 1000;

    /** Size limit passed to {@code CustomTermIndex.cleanEntriesByMaxSize(int)}. */
    private static final int WARNING_CRITICAL_SIZE = 2500;

    /** Rules whose name starts with this prefix produce MORPHOLOGICAL variations. */
    private static final String M_PREFIX = "M";

    /** Maximum number of grouping keys shown in the per-class trace message. */
    private static final int MAX_TRACE_EXAMPLES = 6;

    @ExternalResource(key = ObserverResource.OBSERVER, mandatory = false)
    protected ObserverResource observerResource;

    @ExternalResource(key = TermIndexResource.TERM_INDEX, mandatory = true)
    private TermIndexResource termIndexResource;

    public static final String YAML_VARIANT_RULES = "YamlVariantRules";
    @ExternalResource(key = YAML_VARIANT_RULES, mandatory = true)
    private YamlVariantRules yamlVariantRules;

    @ExternalResource(key = TermHistoryResource.TERM_HISTORY, mandatory = true)
    private TermHistoryResource historyResource;

    /** Estimated number of rule applications, accumulated over all run configurations. */
    private BigInteger totalComparisons = BigInteger.valueOf(0);

    /*
     * Number of rule applications done for the key currently gathered.
     * - long: the total it is compared to may exceed the int range;
     * - volatile: written by the gathering thread, read by the progress timer thread.
     */
    private volatile long nbComparisons = 0;

    private Optional<SubTaskObserver> taskObserver = Optional.absent();

    /** Associates a custom index name with the variant rule index to use on it. */
    static class RunConfig {
        final String indexName;
        final VariantRuleIndex variantRuleIndex;

        RunConfig(String indexName, VariantRuleIndex variantRuleIndex) {
            this.indexName = indexName;
            this.variantRuleIndex = variantRuleIndex;
        }
    }

    /*
     *  Do not deactivate gathering on key_lemma_lemma, otherwise we lose
     *  morphological gathering based on single-word (with [compound] tag in yaml).
     *  TODO : understanding why
     */
    private static final RunConfig[] RUN_CONFIGS = new RunConfig[] {
            new RunConfig(TermIndexes.WORD_COUPLE_LEMMA_LEMMA, VariantRuleIndex.DEFAULT),
            new RunConfig(TermIndexes.WORD_COUPLE_LEMMA_STEM, VariantRuleIndex.DEFAULT),
            new RunConfig(TermIndexes.TERM_HAS_PREFIX_LEMMA, VariantRuleIndex.PREFIX),
            new RunConfig(TermIndexes.TERM_HAS_DERIVATES_LEMMA, VariantRuleIndex.DERIVATION) };

    /**
     * Initializes the variant rules with the term index and, when an observer
     * resource has been bound, fetches the observer for {@link #TASK_NAME}.
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        this.yamlVariantRules.initialize(this.termIndexResource.getTermIndex());
        if (observerResource != null) {
            taskObserver = Optional.of(observerResource.getTaskObserver(TASK_NAME));
        }
    }

    /**
     * Runs the gathering over every run configuration of {@link #RUN_CONFIGS}.
     *
     * <p>Returns immediately when the term index holds no term. Otherwise each
     * custom index is first cleaned and measured in order to estimate the total
     * amount of work, then gathered and finally dropped from the term index.
     */
    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        TermIndex termIndex = this.termIndexResource.getTermIndex();
        LOGGER.info("Starting syntactic term gathering for TermIndex {}", termIndex.getName());

        if (termIndex.getTerms().isEmpty()) {
            return;
        }

        /*
         * Prepare observer and indexes
         */
        for (RunConfig runConfig : RUN_CONFIGS) {
            CustomTermIndex customIndex = termIndex.getCustomIndex(runConfig.indexName);
            customIndex.cleanSingletonKeys();

            // clean biggest classes
            customIndex.cleanEntriesByMaxSize(WARNING_CRITICAL_SIZE);

            CustomIndexStats stats = new CustomIndexStats(customIndex);

            // Display class sizes
            Stopwatch sw1 = Stopwatch.createStarted();
            // NOTE(review): k counts the keys of getSizeCounters(), i.e. presumably the
            // distinct class sizes rather than the classes themselves - confirm the log label.
            int k = 0;
            LOGGER.debug("Biggest class is {}, size: {}", stats.getBiggestClass(), stats.getBiggestSize());

            for (Integer i : stats.getSizeCounters().keySet()) {
                k++;
                int size = stats.getSizeCounters().get(i).size();
                // i * (i - 1) rule applications per class of i terms (both directions of
                // each pair). Multiply as BigInteger: the int product size * i * (i - 1)
                // silently overflows on big indexes.
                long classSize = i.longValue();
                BigInteger applicationsPerClass = BigInteger.valueOf(classSize * (classSize - 1));
                totalComparisons = totalComparisons.add(
                        BigInteger.valueOf(size).multiply(applicationsPerClass));
            }
            LOGGER.debug("Number of term pairs to test: {}", totalComparisons);
            sw1.stop();
            LOGGER.debug("Time to get the comparisons number: {}", sw1.elapsed(TimeUnit.MILLISECONDS));
            LOGGER.debug("Number of classes: {}", k);
            if (taskObserver.isPresent()) {
                taskObserver.get().setTotalTaskWork(totalComparisons.longValue());
            }
        }

        LOGGER.debug("Gathering with default variant rule indexing (source and target patterns)");
        for (RunConfig runConfig : RUN_CONFIGS) {
            gather(runConfig.indexName, runConfig.variantRuleIndex);
            termIndex.dropCustomIndex(runConfig.indexName);
        }

    }

    /**
     * Applies the variant rules to every pair of terms sharing a key of the
     * given custom index, in both directions.
     *
     * @param gatheringKey     name of the custom index to iterate over
     * @param variantRuleIndex rule index handed to the rule matcher
     */
    private void gather(final String gatheringKey, VariantRuleIndex variantRuleIndex) {
        LOGGER.debug("Rule-based gathering over the pregathering key {}", gatheringKey);

        // create the index
        CustomTermIndex customIndex = this.termIndexResource.getTermIndex().getCustomIndex(gatheringKey);
        LOGGER.debug("Rule-based gathering over {} classes", customIndex.size());

        // Reset before the timer starts so that no tick reports the previous key's count
        nbComparisons = 0;

        // Log the progress every 5 seconds. The ratio is computed against the total
        // accumulated over all keys, while the counter is reset for each key.
        Timer progressLoggerTimer = new Timer("Syn. Variant Gathering Timer");
        progressLoggerTimer.schedule(new TimerTask() {
            @Override
            public void run() {
                SyntacticTermGatherer.LOGGER.info("progress for key {}: ({}%)", gatheringKey,
                        String.format("%.2f", (nbComparisons * 100.0) / totalComparisons.doubleValue()));
            }
        }, 5000L, 5000L);

        try {
            // Do the gathering in each class
            for (String cls : customIndex.keySet()) {
                List<Term> list = customIndex.getTerms(cls);
                traceClass(cls, list);

                for (ListIterator<Term> sourceIt = list.listIterator(); sourceIt.hasNext();) {
                    Term source = sourceIt.next();
                    // Start right after the source: each unordered pair is visited once
                    // and the rules are applied in both directions.
                    for (ListIterator<Term> targetIt = list.listIterator(sourceIt.nextIndex()); targetIt
                            .hasNext();) {
                        nbComparisons += 2;
                        Term target = targetIt.next();
                        applyGatheringRules(variantRuleIndex, source, target);
                        applyGatheringRules(variantRuleIndex, target, source);
                        if (nbComparisons % OBSERVING_STEP == 0 && taskObserver.isPresent()) {
                            taskObserver.get().work(OBSERVING_STEP);
                        }
                    }
                }
            }
        } finally {
            // Always stop the timer thread, even when a rule application fails:
            // it is a non-daemon thread and would otherwise keep logging forever.
            progressLoggerTimer.cancel();
        }
    }

    /**
     * Traces the key, size and first grouping keys of a term class holding more
     * than one term. The examples are only built when TRACE is enabled.
     */
    private void traceClass(String cls, List<Term> list) {
        if (list.size() > 1 && LOGGER.isTraceEnabled()) {
            List<String> examples = Lists.newLinkedList();
            for (Term t : list) {
                if (examples.size() >= MAX_TRACE_EXAMPLES) {
                    break;
                }
                examples.add(t.getGroupingKey());
            }
            LOGGER.trace("Rule-based gathering over the '" + cls + "' term class of size " + list.size() + ": "
                    + Joiner.on(" ").join(examples));
        }
    }

    /**
     * Looks up the rule matching the ordered pair (source, target) and, when
     * there is one, registers the corresponding variation.
     */
    private void applyGatheringRules(VariantRuleIndex variantRuleIndex, Term source, Term target) {
        VariantRule matchingRule = yamlVariantRules.getMatchingRule(variantRuleIndex, source, target);
        if (matchingRule != null) {
            applyMatchingRule(matchingRule, source, target);
        }
    }

    /**
     * Adds {@code target} as a variation of {@code source}. The variation is
     * MORPHOLOGICAL when the rule name starts with {@link #M_PREFIX},
     * SYNTACTICAL otherwise.
     */
    private void applyMatchingRule(VariantRule matchingRule, Term source, Term target) {
        // Warn about terms whose frequency has not been set
        checkFrequency(source);
        checkFrequency(target);

        // Disabled: swapping of source and target according to their frequency.
        //      /*
        //       * Reverse the variation when target frequency is
        //       * bigger than source frequency
        //       */
        //      if(baseTargetComparator.compare(source, target) > 0) {
        //         // swaps terms, sets the most frequent  and shortest as the source
        //         Term aux = source;
        //         source = target;
        //         target = aux;
        //      }

        VariationType variationType = matchingRule.getName().startsWith(M_PREFIX)
                ? VariationType.MORPHOLOGICAL
                : VariationType.SYNTACTICAL;
        TermVariation tv = source.addTermVariation(target, variationType, matchingRule.getName());

        watch(source, target, tv);
    }

    /**
     * Records a history event for each of the two terms that is being watched.
     */
    private void watch(Term source, Term target, TermVariation tv) {
        if (historyResource.getHistory().isWatched(source.getGroupingKey())) {
            historyResource.getHistory().saveEvent(source.getGroupingKey(), this.getClass(),
                    "Term has a new variation: " + tv);
        }

        if (historyResource.getHistory().isWatched(target.getGroupingKey())) {
            historyResource.getHistory().saveEvent(target.getGroupingKey(), this.getClass(),
                    "Term has a new variation base: " + tv);
        }
    }

    /**
     * Logs a warning when the frequency of the term is zero.
     */
    private void checkFrequency(Term term) {
        if (term.getFrequency() == 0) {
            LOGGER.warn("Frequency of term {} must be greater than 0 before running SyntactticTermGatherer AE",
                    term.getGroupingKey());
        }
    }

    @Override
    public void process(JCas cas) throws AnalysisEngineProcessException {
        // nothing to do at cas level
    }

    // Disabled: comparator used by the source/target swapping in applyMatchingRule.
    //   private static Comparator<Term> baseTargetComparator = new Comparator<Term>() {
    //      public int compare(Term a, Term b) {
    //           return ComparisonChain.start()
    //               .compare(b.getFrequency(), a.getFrequency())
    //               .compare(a.getWords().size(), b.getWords().size())
    //               .result();
    //         }
    //   };

}