org.phenotips.vocabulary.AbstractCSVAnnotationsExtension.java Source code

Introduction

Here is the source code for org.phenotips.vocabulary.AbstractCSVAnnotationsExtension.java

Source

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/
 */
package org.phenotips.vocabulary;

import org.xwiki.stability.Unstable;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import javax.annotation.Nonnull;
import javax.inject.Inject;

import org.apache.commons.collections4.MultiValuedMap;
import org.apache.commons.collections4.multimap.ArrayListValuedHashMap;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.slf4j.Logger;

/**
 * Implements {@link VocabularyExtension} to annotate {@link VocabularyInputTerm} from {@link #getTargetVocabularyIds
 * supported vocabularies} with data from {@link #getAnnotationSource a tab- or comma-separated file}. The default
 * behavior implemented in this base class is to gather data from the named columns in the file, and add this data to
 * the respective terms when reindexing a supported vocabulary. Setting up the names of the columns is done by the
 * concrete class, either by {@link #setupCSVParser telling} the CSV parser to treat the first row as the header
 * definition, or by explicitly assigning names to columns.
 * <p>
 * To let the first row be parsed as the column names:
 * </p>
 *
 * <pre>
 * {@code
 *   protected CSVFormat setupCSVParser(Vocabulary vocabulary)
 *   {
 *       return CSVFormat.TDF.withHeader();
 *   }
 * }
 * </pre>
 * <p>
 * To explicitly name columns:
 * </p>
 *
 * <pre>
 * {@code
 *   protected CSVFormat setupCSVParser(Vocabulary vocabulary)
 *   {
 *       return CSVFormat.TDF.withHeader("id", null, "symptom");
 *   }
 * }
 * </pre>
 * <p>
 * With the default implementation of {@link #processCSVRecordRow the row processing function}, having a column named
 * {@code id} is mandatory.
 * </p>
 * <p>
 * Columns that are not named are ignored.
 * </p>
 * <p>
 * Missing, empty, or whitespace-only cells will be ignored.
 * </p>
 * <p>
 * If one or more of the parsed fields already have values in the term being extended, then the existing values will
 * be discarded and replaced with the data read from the input file.
 * </p>
 * <p>
 * If multiple rows exist for the same term identifier, then the values are accumulated in lists of values. If a
 * field is declared as non-multi-valued in the schema definition, then it is the user's responsibility to ensure
 * that only one value is specified for it. If a value is specified multiple times in the input file, then it will be
 * added to the field multiple times.
 * </p>
 * <p>
 * Example: for the following parser set-up:
 * </p>
 *
 * <pre>
 * {@code
 * CSVFormat.DEFAULT.withHeader("id", null, "symptom", null, "frequency")
 * }
 * </pre>
 *
 * and the following input file:
 *
 * <pre>
 * {@code
 * MIM:162200,"NEUROFIBROMATOSIS, TYPE I",HP:0009737,"Lisch nodules",HP:0040284,HPO:curators
 * MIM:162200,"NEUROFIBROMATOSIS, TYPE I",HP:0001256,"Intellectual disability, mild",HP:0040283,HPO:curators
 * MIM:162200,"NEUROFIBROMATOSIS, TYPE I",HP:0000316,"Hypertelorism",,HPO:curators
 * MIM:162200,"NEUROFIBROMATOSIS, TYPE I",HP:0000501,"Glaucoma",HP:0040284,HPO:curators
 * }
 * </pre>
 *
 * the following fields will be added:
 * <dl>
 * <dt>{@code "symptom"}</dt>
 * <dd>{@code "HP:0009737"}, {@code HP:0001256}</dd>
 * <dt>{@code "frequency"}</dt>
 * <dd>{@code "HP:0040284"}, {@code HP:0040283}, {@code "HP:0040284"}</dd>
 * </dl>
 *
 * @version $Id: 75c6a3e5375e7baf1a50ea2af2896c423498015a $
 * @since 1.3
 */
@Unstable("New API introduced in 1.3")
public abstract class AbstractCSVAnnotationsExtension implements VocabularyExtension {
    protected static final String ID_KEY = "id";

    /**
     * Data read from the source file. The key of the outer map is the identifier of the term being extended, and the
     * value of the outer map is the data to add to the term. The key of the inner map is the name of a field, while
     * the value of the inner map holds the values to add to that field.
     */
    protected Map<String, MultiValuedMap<String, String>> data = new HashMap<>();

    /** Logging helper object. */
    @Inject
    protected Logger logger;

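    /** The number of indexing operations in progress; the parsed data is only kept while this is non-zero. */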
    private final AtomicInteger operationsInProgress = new AtomicInteger(0);

    @Override
    public boolean isVocabularySupported(@Nonnull final Vocabulary vocabulary) {
        return getTargetVocabularyIds().contains(vocabulary.getIdentifier());
    }

    @Override
    public void indexingStarted(@Nonnull final Vocabulary vocabulary) {
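        // Load the annotation data only when the first concurrent indexing operation starts;
        // overlapping reindexes of other supported vocabularies reuse the same cached data.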
        if (this.operationsInProgress.incrementAndGet() == 1) {
            this.data = new HashMap<>();
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    new URL(getAnnotationSource()).openConnection().getInputStream(), StandardCharsets.UTF_8))) {
                CSVFormat parser = setupCSVParser(vocabulary);
                for (final CSVRecord row : parser.parse(in)) {
                    processCSVRecordRow(row, vocabulary);
                }
            } catch (final IOException ex) {
                this.logger.error("Failed to load annotation source: {}", ex.getMessage());
            }
        }
    }

    @Override
    public void extendTerm(VocabularyInputTerm term, Vocabulary vocabulary) {
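        // Any values already present in the parsed fields are replaced with the annotation data.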
        MultiValuedMap<String, String> termData = this.data.get(term.getId());
        if (termData == null || termData.isEmpty()) {
            return;
        }
        for (Map.Entry<String, Collection<String>> datum : termData.asMap().entrySet()) {
            if (!datum.getValue().isEmpty()) {
                term.set(datum.getKey(), datum.getValue());
            }
        }
    }

    @Override
    public void indexingEnded(Vocabulary vocabulary) {
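        // Discard the cached annotation data once the last indexing operation in progress ends.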
        if (this.operationsInProgress.decrementAndGet() == 0) {
            this.data = null;
        }
    }

    @Override
    public void extendQuery(SolrQuery query, Vocabulary vocabulary) {
        // The base extension doesn't change queries in any way, assuming that the extra fields are only to be stored or
        // explicitly queried, not queried automatically. Override if new fields must automatically be included in
        // queries.
    }

    /**
     * Processes and caches the row data. By default, it simply copies every mapped value from the row. Override if
     * further processing of the data is needed.
     *
     * @param row the {@link CSVRecord data row} to process
     * @param vocabulary the vocabulary being indexed
     */
    protected void processCSVRecordRow(final CSVRecord row, final Vocabulary vocabulary) {
        Map<String, String> csvData = row.toMap();
        MultiValuedMap<String, String> termData = this.data.get(row.get(ID_KEY));
        if (termData == null) {
            termData = new ArrayListValuedHashMap<>();
            this.data.put(row.get(ID_KEY), termData);
        }
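        // Copy every named, non-blank cell except the identifier column itself.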
        for (Map.Entry<String, String> item : csvData.entrySet()) {
            if (!ID_KEY.equals(item.getKey()) && StringUtils.isNoneBlank(item.getKey(), item.getValue())) {
                termData.put(item.getKey(), item.getValue());
            }
        }
    }

    /**
     * Helper method that gets the value of the specified column, as a string, if it exists, without throwing
     * exceptions.
     *
     * @param row the {@link CSVRecord row} currently being processed
     * @param colNumber the 0-based index of the column of interest
     * @return the value in the target column, if such a value exists, {@code null} otherwise
     */
    protected String getRowItem(@Nonnull final CSVRecord row, final int colNumber) {
        if (colNumber < row.size()) {
            return row.get(colNumber);
        }
        return null;
    }

    /**
     * Specifies the vocabularies targeted by this extension.
     *
     * @return a collection of valid {@link Vocabulary#getIdentifier() vocabulary identifiers}
     */
    protected abstract Collection<String> getTargetVocabularyIds();

    /**
     * Specifies the annotation source URL. This can be a web (http) link, a network file, a local
     * {@code file:///} reference, a classpath resource, or any other URL format that can be read by the JVM.
     *
     * @return a valid annotation source URL
     */
    protected abstract String getAnnotationSource();

    /**
     * <p>
     * Sets up a CSV parser so that it accepts the format of the input file, and has names for each column of interest.
     * Giving names to columns is mandatory if the default implementation of {@link #processCSVRecordRow} is used. A
     * column named {@code id} holding the identifier of the target term is required, and only named columns will be
     * automatically extracted as data to add to each {@link #extendTerm extended term}. For example:
     * {@code return CSVFormat.TDF.withHeader("id", null, "symptom")}.
     * </p>
     * <p>
     * If the file has the first row as a header, then it can be automatically parsed as the column names with
     * {@code return CSVFormat.TDF.withHeader()}.
     * </p>
     * <p>
     * Columns that aren't mapped, or are mapped to {@code null} or the empty string, will be ignored.
     * </p>
     * <p>
     * If a custom implementation of {@link #processCSVRecordRow} that doesn't rely on named columns is used, then
     * simply specifying the format of the file is enough, for example {@code return CSVFormat.DEFAULT} or
     * {@code return CSVFormat.TDF.withSkipHeaderRecord().withCommentMarker('#')}.
     * </p>
     *
     * @param vocabulary the vocabulary being indexed
     * @return a CSV parser that can read the annotation file
     */
    protected abstract CSVFormat setupCSVParser(Vocabulary vocabulary);
}
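
Example

For illustration, here is a minimal sketch of a concrete subclass. It is hypothetical, not part of the PhenoTips sources: the "omim" vocabulary identifier and the annotation URL are assumptions, and the column mapping simply mirrors the worked example from the class Javadoc above. A real extension would additionally need to be declared as an XWiki component; that wiring is omitted here.

package org.phenotips.vocabulary;

import java.util.Collection;
import java.util.Collections;

import org.apache.commons.csv.CSVFormat;

/** Hypothetical extension annotating OMIM terms with symptom and frequency data from a CSV file. */
public class ExampleOmimAnnotationsExtension extends AbstractCSVAnnotationsExtension {
    @Override
    protected Collection<String> getTargetVocabularyIds() {
        // Only terms from vocabularies with these identifiers are extended;
        // "omim" is an assumed identifier, adjust it for the actual deployment.
        return Collections.singleton("omim");
    }

    @Override
    protected String getAnnotationSource() {
        // Made-up URL; any URL readable by the JVM works, including file:/// references.
        return "http://example.org/annotations/omim-symptoms.csv";
    }

    @Override
    protected CSVFormat setupCSVParser(Vocabulary vocabulary) {
        // Comma-separated input; name the columns of interest and skip the unnamed
        // (null) ones. The mandatory "id" column identifies the term to extend.
        return CSVFormat.DEFAULT.withHeader("id", null, "symptom", null, "frequency");
    }
}

With this setup and the sample input shown in the class Javadoc, reindexing the vocabulary would accumulate the "symptom" and "frequency" values listed there and add them to the term MIM:162200.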