org.gbif.refine.datasets.ebd.ShoreBirdCensus.java Source code

Introduction

Here is the source code for org.gbif.refine.datasets.ebd.ShoreBirdCensus.java, a one-off command-line tool from the gbif/refine project that converts the ShoreBirdCensus_1993.csv dataset into Darwin Core event and occurrence files.
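
Running the class's main method writes event.txt and occurrence.txt to a freshly created temporary directory and logs the path. To write the files to a directory of your choosing instead, call the processing method directly. A minimal sketch, assuming the gbif/refine project (which bundles the CSV as a classpath resource) and its dependencies are on the classpath; the output path is hypothetical:

import java.io.File;

import org.gbif.refine.datasets.ebd.ShoreBirdCensus;

public class RunShoreBirdCensus {
    public static void main(String[] args) throws Exception {
        // hypothetical output directory; any writable directory will do
        File output = new File("/tmp/shorebird-dwc");
        output.mkdirs();

        // writes event.txt and occurrence.txt into the directory
        ShoreBirdCensus.processShoreBirdCensus(output);
    }
}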

Source

package org.gbif.refine.datasets.ebd;

import org.gbif.api.model.checklistbank.NameUsageMatch;
import org.gbif.api.service.checklistbank.NameUsageMatchingService;
import org.gbif.api.vocabulary.Rank;
import org.gbif.io.CSVReader;
import org.gbif.io.CSVReaderFactory;
import org.gbif.refine.client.WebserviceClientModule;
import org.gbif.refine.utils.Constants;
import org.gbif.refine.utils.FileUtils;
import org.gbif.refine.utils.TermUtils;
import org.gbif.utils.file.ClosableReportingIterator;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.Locale;
import java.util.Set;

import javax.validation.constraints.NotNull;

import com.google.common.base.Strings;
import com.google.common.collect.Sets;
import org.codehaus.jettison.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class is used to clean, augment, and transform the original ShoreBirdCensus_1993.csv dataset published by
 * EBD-CSIC into a Darwin Core (DwC) sample-based, star format: event records with their associated occurrence
 * records.
 */
public class ShoreBirdCensus {
    private static final Logger LOG = LoggerFactory.getLogger(ShoreBirdCensus.class);

    // read-only client for the GBIF name-matching web service, used to match scientific names
    // against the GBIF Backbone Taxonomy
    private static final NameUsageMatchingService MATCHING_SERVICE = WebserviceClientModule
            .webserviceClientReadOnly().getInstance(NameUsageMatchingService.class);

    public static void main(String[] args) throws IOException {
        // directory where files should be written to
        File output = org.gbif.utils.file.FileUtils.createTempDir();
        processShoreBirdCensus(output);
        LOG.info("Processing ShoreBirdCensus_1993.csv complete! event.txt and occurrence.txt written to: "
                + output.getAbsolutePath());
    }

    /**
     * Iterates over the original source file and does the following:
     * i) cleans it (e.g. converts dates to ISO format, matches scientific names to the GBIF Backbone Taxonomy)
     * ii) augments it (e.g. adds an occurrenceID, higher taxonomy columns, etc.)
     * iii) transforms it into star format (two files: event.txt, a list of unique sampling events, and
     * occurrence.txt, a list of all observations from all sampling events)
     *
     * @param output directory to write files to
     *
     * @throws IOException if the method fails
     */
    public static void processShoreBirdCensus(File output) throws IOException {
        // load the original source file to process
        InputStream fis = ShoreBirdCensus.class.getResourceAsStream("/datasets/ebd/ShoreBirdCensus_1993.csv");

        // create a CSV reader for the file: Latin1-encoded, semicolon-delimited, double-quoted, one header row to skip
        CSVReader reader = CSVReaderFactory.build(fis, "Latin1", ";", '"', 1);

        // get header row for the new event and occurrence files that this method will output
        String[] header = getHeader();

        // sampling events file
        Writer writerEvents = FileUtils.startEventsFile(output, header);

        // observations file
        Writer writerOccs = FileUtils.startOccurrencesFile(output, header);

        // to capture all unique eventIDs
        Set<String> events = Sets.newHashSet();

        ClosableReportingIterator<String[]> iter = null;
        int line = 0;
        try {
            iter = reader.iterator();
            while (iter.hasNext()) {
                line++;
                String[] record = iter.next();
                if (record == null || record.length == 0) {
                    continue;
                }

                // create new augmented record
                String[] modifiedRecord = Arrays.copyOf(record, header.length);
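                // Arrays.copyOf pads the copy with nulls out to header.length (43 columns), making
                // room for the derived and static columns filled in below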

                // eventID as read from the source file (a location suffix is appended to it further below)
                String eventID = Strings.nullToEmpty(modifiedRecord[0]);

                // unique occurrenceID (institutionCode:datasetID:eventID:sequentialID)
                modifiedRecord[25] = "EBD-CSIC:SP_SI001346_ShoreBirdCensus:" + eventID + ":" + line;

                // add static values
                modifiedRecord[24] = "EBD-CSIC";
                modifiedRecord[26] = "http://creativecommons.org/licenses/by/4.0/legalcode";
                modifiedRecord[27] = "Event";
                modifiedRecord[28] = "HumanObservation";
                modifiedRecord[30] = "individuals";
                modifiedRecord[31] = "es";
                modifiedRecord[32] = "SP_SI001346_ShoreBirdCensus";
                modifiedRecord[33] = "0";
                modifiedRecord[34] = "5";
                modifiedRecord[35] = "Band census";
                modifiedRecord[38] = "square_kilometre";
                modifiedRecord[39] = "Spain";
                modifiedRecord[40] = "ES";
                modifiedRecord[41] = "Huelva";
                modifiedRecord[42] = "Parque Nacional de Doñana";

                // convert date (column #1) into ISO format
                String d = modifiedRecord[1];
                DateFormat df = new SimpleDateFormat("dd-MMM-yy", new Locale("es", "ES"));
                Date date = df.parse(d);
                modifiedRecord[1] = Constants.ISO_DF.format(date);
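                // e.g. converts a Spanish dd-MMM-yy date such as "15-ene-93" to ISO 8601 ("1993-01-15"),
                // assuming Constants.ISO_DF is a yyyy-MM-dd formatter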

                // create eventTime (column #12) from the start and end times
                String startTime = record[6];
                String endTime = record[7];
                if (!Strings.isNullOrEmpty(startTime) && !Strings.isNullOrEmpty(endTime) && !startTime.equals("-")
                        && !endTime.equals("-")) {
                    modifiedRecord[6] = startTime + "+01";
                    modifiedRecord[7] = endTime + "+01";
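                    // eventTime below is an ISO 8601 time interval, e.g. "08:30+01/11:45+01" (local time, UTC+1)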
                    modifiedRecord[12] = startTime + "+01/" + endTime + "+01";
                }

                // create dynamicProperties (column #13) from the weather columns
                JSONObject jo = new JSONObject();
                if (!Strings.isNullOrEmpty(modifiedRecord[8]) && !modifiedRecord[8].equals("-")) {
                    jo.put("cloudiness", modifiedRecord[8]);
                }
                if (!Strings.isNullOrEmpty(modifiedRecord[9]) && !modifiedRecord[9].equals("-")) {
                    jo.put("wind speed", modifiedRecord[9]);
                }
                if (!Strings.isNullOrEmpty(modifiedRecord[10]) && !modifiedRecord[10].equals("-")) {
                    jo.put("wind direction", modifiedRecord[10]);
                }
                if (!Strings.isNullOrEmpty(modifiedRecord[11]) && !modifiedRecord[11].equals("-")) {
                    jo.put("waves", modifiedRecord[11]);
                }
                // only write dynamicProperties when at least one property was captured
                if (jo.length() > 0) {
                    modifiedRecord[13] = jo.toString();
                }

                // add higher taxonomy
                String name = modifiedRecord[3];
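                // query the GBIF name-matching service with a SPECIES rank hint, no classification
                // hints, non-strict matching, and non-verbose results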
                NameUsageMatch match = MATCHING_SERVICE.match(name, Rank.SPECIES, null, false, false);
                if (match.getMatchType() == NameUsageMatch.MatchType.EXACT) {
                    modifiedRecord[14] = match.getKingdom();
                    modifiedRecord[15] = match.getPhylum();
                    modifiedRecord[16] = match.getClazz();
                    modifiedRecord[17] = match.getOrder();
                    modifiedRecord[18] = match.getFamily();
                    modifiedRecord[19] = match.getGenus();

                    // specificEpithet: the backbone returns the species as a binomial, so take the second token
                    if (match.getSpecies() != null) {
                        String[] parts = match.getSpecies().split(" ");
                        if (parts.length == 2) {
                            modifiedRecord[20] = parts[1];
                        }
                    }

                    modifiedRecord[21] = match.getScientificName();
                    modifiedRecord[22] = "species";
                    modifiedRecord[23] = match.getStatus().toString();
                } else if (name.equals("Sterna sp.")) {
                    LOG.error("Handling special case for Sterna sp.");
                    modifiedRecord[19] = "Sterna";
                    modifiedRecord[21] = "Sterna sp.";
                } else {
                    LOG.error("No exact match for: " + name);
                }

                // depending on locationID, set WKT representation and sampleSizeValue of area being sampled
                // also create one sampling event per location sampled
                String locationID = modifiedRecord[2];
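                // each polygon below is an axis-aligned WKT bounding box (longitude latitude order) over the
                // stretch of shore sampled; sampleSizeValue (column 37) is its area in square kilometres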
                if (locationID.equalsIgnoreCase("cama")) {
                    modifiedRecord[29] = "POLYGON((-6.529167 36.981904, -6.470398 36.981904, -6.470398 36.931576, -6.529167 36.931576, -6.529167 36.981904))";
                    modifiedRecord[37] = "4";
                    modifiedRecord[0] = modifiedRecord[0] + "-CAMA";
                } else if (locationID.equalsIgnoreCase("zaca")) {
                    modifiedRecord[29] = "POLYGON((-6.470398 36.931576, -6.429026 36.931576, -6.429026 36.873193, -6.470398 36.873193, -6.470398 36.931576))";
                    modifiedRecord[37] = "2.1";
                    modifiedRecord[0] = modifiedRecord[0] + "-ZACA";
                } else if (locationID.equalsIgnoreCase("maza")) {
                    modifiedRecord[29] = "POLYGON((-6.429026 36.873193, -6.346964 36.873193, -6.346964 36.804956, -6.429026 36.804956, -6.429026 36.873193))";
                    modifiedRecord[37] = "2.4";
                    modifiedRecord[0] = modifiedRecord[0] + "-MAZA";
                } else {
                    LOG.error("Line " + line + " has no location!!");
                }

                // occurrenceStatus (present vs absent)
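                // (presumably a quantity of 0 maps to absent and any positive count to present)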
                modifiedRecord[36] = TermUtils.getOccurrenceStatus(Integer.valueOf(modifiedRecord[4])).toString()
                        .toLowerCase();

                // always output line to new occurrences file
                String row = FileUtils.tabRow(modifiedRecord);
                writerOccs.write(row);

                // only output the line to the events file if this event hasn't been written yet; use the
                // location-suffixed eventID so each location sampled becomes its own sampling event
                String suffixedEventID = modifiedRecord[0];
                if (!events.contains(suffixedEventID)) {
                    writerEvents.write(row);
                    events.add(suffixedEventID);
                }
            }
            LOG.info("Iterated over " + line + " rows.");
            LOG.info("Found " + events.size() + " unique events.");
        } catch (Exception e) {
            // some error reading or transforming the file; report it (processing stops at the failing row)
            LOG.error("Exception caught while iterating over file", e);
        } finally {
            if (iter != null) {
                iter.close();
            }
            reader.close();
            writerEvents.close();
            writerOccs.close();
        }
    }

    /**
     * @return array of column names in output files (event.txt, occurrence.txt)
     */
    @NotNull
    private static String[] getHeader() {
        String[] header = new String[43];
        header[0] = "eventID";
        header[1] = "eventDate";
        header[2] = "locationID";
        header[3] = "name";
        header[4] = "organismQuantity";
        header[5] = "occurrenceRemarks";

        // combined into eventTime - column 12
        header[6] = "startTime";
        header[7] = "endTime";

        // combined into dynamicProperties - column 13
        header[8] = "cloudiness";
        header[9] = "wind speed";
        header[10] = "wind direction";
        header[11] = "waves";

        header[12] = "eventTime";
        header[13] = "dynamicProperties";

        // higher taxonomy
        header[14] = "kingdom";
        header[15] = "phylum";
        header[16] = "class";
        header[17] = "order";
        header[18] = "family";
        header[19] = "genus";
        header[20] = "specificEpithet";
        header[21] = "scientificName";
        header[22] = "taxonRank";
        header[23] = "taxonomicStatus";

        // other
        header[24] = "institutionCode";
        header[25] = "occurrenceID";
        header[26] = "license";
        header[27] = "type";
        header[28] = "basisOfRecord";
        header[29] = "footprintWKT";
        header[30] = "organismQuantityType";
        header[31] = "language";
        header[32] = "datasetID";
        header[33] = "minimumElevationInMeters";
        header[34] = "maximumElevationInMeters";
        header[35] = "samplingProtocol";
        header[36] = "occurrenceStatus";
        header[37] = "sampleSizeValue";
        header[38] = "sampleSizeUnit";
        header[39] = "country";
        header[40] = "countryCode";
        header[41] = "stateProvince";
        header[42] = "locality";

        return header;
    }
}
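
Once the program has run, a quick sanity check is to count the rows written to each file. A minimal sketch using only the JDK; it assumes each output file begins with a single header row (which is what the header argument to FileUtils.startEventsFile and FileUtils.startOccurrencesFile suggests), and reads as ISO-8859-1 purely so the count never fails on encoding:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;

public class CountRows {
    public static void main(String[] args) throws IOException {
        // args[0] is the output directory that ShoreBirdCensus logged on completion
        Path dir = Paths.get(args[0]);
        count(dir.resolve("event.txt"), "unique events");
        count(dir.resolve("occurrence.txt"), "occurrences");
    }

    private static void count(Path file, String label) throws IOException {
        try (Stream<String> lines = Files.lines(file, StandardCharsets.ISO_8859_1)) {
            // subtract 1 for the header row at the top of the file
            System.out.println(label + ": " + (lines.count() - 1));
        }
    }
}

The events figure should match the "unique events" count the program logs, and the occurrences figure the number of non-empty data rows in ShoreBirdCensus_1993.csv.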