org.gbif.refine.datasets.nhmd.RooftopBugs.java Source code

Introduction

Here is the source code for org.gbif.refine.datasets.nhmd.RooftopBugs.java
Source

package org.gbif.refine.datasets.nhmd;

import org.gbif.api.model.checklistbank.NameUsage;
import org.gbif.api.model.checklistbank.NameUsageMatch;
import org.gbif.api.model.common.LinneanClassification;
import org.gbif.api.service.checklistbank.NameUsageMatchingService;
import org.gbif.api.vocabulary.Rank;
import org.gbif.io.CSVReader;
import org.gbif.io.CSVReaderFactory;
import org.gbif.refine.client.WebserviceClientModule;
import org.gbif.refine.utils.Constants;
import org.gbif.refine.utils.FileUtils;
import org.gbif.refine.utils.TermUtils;
import org.gbif.utils.file.ClosableReportingIterator;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.validation.constraints.NotNull;

import com.google.common.base.Strings;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class is used to clean, augment, and transform the original RooftopBugs dataset published by the Natural
 * History Museum of Denmark into a DwC sample-based, star format with event records with associated occurrences.
 */
public class RooftopBugs {

    // class-wide logger
    private static final Logger LOG = LoggerFactory.getLogger(RooftopBugs.class);
    // taxa keyed by canonical name (name without authorship); assigned in main() from loadTaxaList() and
    // read by processLepidoptera()
    private static Map<String, NameUsage> names;
    // eventIDs already written to the events file; initialised in main(), filled by processLepidoptera()
    // and checked by processColeoptera()
    private static Set<String> events;
    // Coleoptera names accepted as valid without a backbone match; assigned in main() as an unmodifiable set
    private static Set<String> validColeopteraNamesNotInNub;

    // name matching web service client, used by processColeoptera() to verify each name
    private static final NameUsageMatchingService MATCHING_SERVICE = WebserviceClientModule
            .webserviceClientReadOnly().getInstance(NameUsageMatchingService.class);

    // names of the files created inside the output directory
    private static String lepidopteraEventsFileName = "events-lepidoptera.tab";
    private static String lepidopteraOccurrencesFileName = "occurrences-lepidoptera.tab";
    private static String coleopteraOccurrencesFileName = "occurrences-coleoptera.tab";

    /**
     * Runs the whole conversion: loads the taxa list, prepares the shared lookup sets, then processes the
     * Lepidoptera file followed by the Coleoptera file, writing all output into a fresh temporary directory.
     *
     * @param args not used
     *
     * @throws IOException if one of the processing steps propagates an I/O error
     */
    public static void main(String[] args) throws IOException {
        // all taxa, keyed by canonical name
        names = loadTaxaList();
        LOG.info("Loaded {} unique canonical names.", names.size());

        // eventIDs collected while processing
        events = Sets.newHashSet();

        // verified, valid names that do not exist in GBIF Backbone Taxonomy (Nub)
        Set<String> verifiedNames = Sets.newHashSet(
                "Acanthocinus griseus (Fabricius, 1792)", "Aphodius rufipes (Linnaeus, 1758)",
                "Aphodius rufus (Moll, 1782)", "Aphodius sordidus (Fabricius, 1775)",
                "Curculio glandium Marsham, 1802", "Curculio nucum Linnaeus, 1758",
                "Dorytomus rufatus (Bedel, 1886)", "Dorytomus taeniatus (Fabricius, 1781)",
                "Hylobius abietis (Linnaeus, 1758)", "Magdalis barbicornis (Latreille, 1804)",
                "Magdalis ruficornis (Linnaeus, 1758)", "Phytobius leucogaster (Marsham, 1802)");
        validColeopteraNamesNotInNub = Collections.unmodifiableSet(verifiedNames);

        // directory that receives every generated file
        File targetDir = org.gbif.utils.file.FileUtils.createTempDir();
        String targetPath = targetDir.getAbsolutePath();

        // Lepidoptera must go first: it collects the eventIDs the Coleoptera step checks against
        processLepidoptera(targetDir);
        LOG.info("Processing Lepidoptera_1992-2009.csv complete! {} and {} written to: {}",
                new Object[] {lepidopteraEventsFileName, lepidopteraOccurrencesFileName, targetPath});

        // then the Coleoptera records
        processColeoptera(targetDir);
        LOG.info("Processing Coleoptera_1992-2009.csv complete! {} written to: {}",
                coleopteraOccurrencesFileName, targetPath);
    }

    /**
     * Loads the list of taxa from the bundled resource TaxaList-v2.csv: each row gets converted into a NameUsage,
     * and stored in a Map organised by its canonical name (name without authorship). When two rows share a canonical
     * name the first one wins and a warning is logged.
     * <p>
     * Any exception raised while iterating is logged and ends the iteration; the taxa read up to that point are
     * still returned.
     *
     * @return map of canonical name to NameUsage, never null
     *
     * @throws IOException if the resource is missing, or an I/O error propagates from building or closing the reader
     */
    public static Map<String, NameUsage> loadTaxaList() throws IOException {
        final String resource = "/datasets/nhmd/TaxaList-v2.csv";
        // columns read below: taxonID, shorthand rank, genus, species, authorship
        final int expectedColumns = 5;

        // load the original source file to process, failing with a clear message when it is not on the classpath
        InputStream fis = RooftopBugs.class.getResourceAsStream(resource);
        if (fis == null) {
            throw new IOException("Resource not found on classpath: " + resource);
        }

        // create an iterator on the file
        CSVReader reader = CSVReaderFactory.build(fis, "UTF-8", ";", '"', 1);

        // to capture all NameUsages into a map with their canonical name as key
        Map<String, NameUsage> taxa = Maps.newHashMap();

        ClosableReportingIterator<String[]> iter = null;
        int line = 0;
        try {
            iter = reader.iterator();
            while (iter.hasNext()) {
                line++;
                String[] record = iter.next();
                if (record == null || record.length == 0) {
                    continue;
                }
                // pad short rows with nulls so a missing trailing column cannot abort loading of all remaining taxa
                if (record.length < expectedColumns) {
                    record = Arrays.copyOf(record, expectedColumns);
                }

                NameUsage nameUsage = new NameUsage();
                String canonicalName = "";

                // column 0: taxonID
                String taxonID = Strings.nullToEmpty(record[0]);
                if (!Strings.isNullOrEmpty(taxonID)) {
                    nameUsage.setTaxonID(taxonID);
                }

                // column 1: shorthand rank (trimmed like the name columns so stray whitespace still matches)
                String taxonRank = Strings.nullToEmpty(record[1]).trim();
                if (!Strings.isNullOrEmpty(taxonRank)) {
                    Rank rank = interpretShorthandRank(taxonRank);
                    if (rank != null) {
                        nameUsage.setRank(rank);
                    } else {
                        LOG.error("Failed to match shorthand rank: " + taxonRank);
                    }
                }

                // column 2: Genus when rank=species
                String part1 = Strings.nullToEmpty(record[2]).trim();
                if (!Strings.isNullOrEmpty(part1)) {
                    nameUsage.setGenus(part1);
                    canonicalName = canonicalName + part1;
                }

                // column 3: species (e.g. SpecificEpithet when rank=species), only used when a rank was interpreted
                String part2 = Strings.nullToEmpty(record[3]).trim();
                if (!Strings.isNullOrEmpty(part2) && nameUsage.getRank() != null) {
                    nameUsage.setSpecies(part2);
                    canonicalName = canonicalName + " " + part2;
                }

                // column 4: scientificNameAuthorship
                String scientificNameAuthorship = Strings.nullToEmpty(record[4]);
                if (!Strings.isNullOrEmpty(scientificNameAuthorship)) {
                    nameUsage.setAuthorship(scientificNameAuthorship);
                }

                // name without authorship
                if (!Strings.isNullOrEmpty(canonicalName)) {
                    canonicalName = canonicalName.trim();
                    nameUsage.setCanonicalName(canonicalName);
                    if (!taxa.containsKey(canonicalName)) {
                        taxa.put(canonicalName, nameUsage);
                    } else {
                        LOG.warn("Map already contains NameUsage with canonical name: " + canonicalName);
                    }
                } else {
                    LOG.warn("Taxon has no canonical name - check line: " + line);
                }
            }
            LOG.info("Iterated over {} lines in {}.", line, resource);
        } catch (Exception e) {
            // some error validating this file, report where it happened
            LOG.error("Exception caught while iterating over file (line " + line + ")", e);
        } finally {
            if (iter != null) {
                iter.close();
            }
            reader.close();
        }

        return taxa;
    }

    /**
     * Translates the five-letter rank shorthand used in the taxa list into a Rank.
     *
     * @param shorthand shorthand rank, compared case-insensitively
     *
     * @return matching Rank, or null if the shorthand is not one of the known values
     */
    private static Rank interpretShorthandRank(String shorthand) {
        if (shorthand.equalsIgnoreCase("FAMIL")) {
            return Rank.FAMILY;
        } else if (shorthand.equalsIgnoreCase("SUBFA")) {
            return Rank.SUBFAMILY;
        } else if (shorthand.equalsIgnoreCase("TRIBU")) {
            return Rank.TRIBE;
        } else if (shorthand.equalsIgnoreCase("SUPER")) {
            return Rank.SUPERFAMILY;
        } else if (shorthand.equalsIgnoreCase("GENUS")) {
            return Rank.GENUS;
        } else if (shorthand.equalsIgnoreCase("SPECI")) {
            return Rank.SPECIES;
        }
        return null;
    }

    /**
     * Iterates over the original Lepidoptera source file and does the following:
     * i) cleans it (e.g. converts dates to ISO format)
     * ii) augments it (e.g. adds new columns for sample size, higher taxonomy, etc)
     * iii) transforms it into star format: every row is written to the occurrences file, and the first row seen for
     * each eventID is additionally written to the events file.
     * <p>
     * Any exception raised while iterating is logged and ends the iteration; rows written up to that point remain in
     * the output files.
     *
     * @param output directory to write files to
     *
     * @throws IOException if the source resource is missing, or an I/O error propagates from opening or closing the
     *                     reader or writers
     */
    public static void processLepidoptera(File output) throws IOException {
        final String resource = "/datasets/nhmd/Lepidoptera_1992-2009-v3.csv";

        // load the original source file to process, failing with a clear message when it is not on the classpath
        InputStream fis = RooftopBugs.class.getResourceAsStream(resource);
        if (fis == null) {
            throw new IOException("Resource not found on classpath: " + resource);
        }

        // create an iterator on the file
        CSVReader reader = CSVReaderFactory.build(fis, "UTF-8", ";", '"', 1);

        // get header row for the new event and occurrence files that this method will output
        String[] header = getLepidopteraHeader();

        // sampling events file
        Writer writerEvents = FileUtils.startEventsFile(output, header, lepidopteraEventsFileName);

        // observations file
        Writer writerOccs = FileUtils.startOccurrencesFile(output, header, lepidopteraOccurrencesFileName);

        // to capture bad names
        Set<String> namesNotFound = Sets.newTreeSet();

        // source dates look like 21/08/94; the formatter does not change per row so it is built once
        // ("da" is the ISO 639 language code for Danish)
        DateFormat df = new SimpleDateFormat("dd/MM/yy", new Locale("da", "DK"));
        final double millisPerDay = 24 * 60 * 60 * 1000;

        ClosableReportingIterator<String[]> iter = null;
        int line = 0;
        try {
            iter = reader.iterator();
            while (iter.hasNext()) {
                line++;
                String[] record = iter.next();
                if (record == null || record.length == 0) {
                    continue;
                }

                // create new augmented record (padded with nulls up to the width of the output header)
                String[] modifiedRecord = Arrays.copyOf(record, header.length);

                // add static values
                modifiedRecord[16] = "The material sample was collected, and either preserved or destructively processed."; // eventRemarks
                modifiedRecord[17] = "Denmark"; // country
                modifiedRecord[18] = "DK"; // countryCode
                modifiedRecord[19] = "Light trap on rooftop of Zoological Museum, Natural History Museum of Denmark (ZMUC)"; // locality
                modifiedRecord[20] = "55.702512"; // decimalLatitude
                modifiedRecord[21] = "12.558956"; // decimalLongitude
                modifiedRecord[22] = "WGS84"; // geodeticDatum
                modifiedRecord[23] = "modified Robinson light trap"; // samplingProtocol
                modifiedRecord[25] = "day"; // sampleSizeUnit
                modifiedRecord[27] = "http://creativecommons.org/licenses/by/4.0/legalcode"; // license
                modifiedRecord[28] = "Event"; // type
                modifiedRecord[29] = "Zoological Museum, Natural History Museum of Denmark (ZMUC)"; // rightsHolder
                modifiedRecord[30] = "ZMUC"; // institutionCode
                modifiedRecord[31] = "ZMUC"; // ownerInstitutionCode
                modifiedRecord[33] = "MaterialSample"; // basisOfRecord
                modifiedRecord[34] = "Ole Karsholt"; // recordedBy
                modifiedRecord[35] = "Ole Karsholt"; // identifiedBy
                modifiedRecord[37] = "individuals"; // organismQuantityType
                modifiedRecord[39] = "Animalia"; // kingdom
                modifiedRecord[40] = "Arthropoda"; // phylum
                modifiedRecord[41] = "Insecta"; // class

                // store organismQuantity even though it's the same as individualCount
                modifiedRecord[36] = modifiedRecord[8]; // value copied from individualCount

                // occurrenceStatus (present vs absent); fixed locale keeps the lower-casing stable on any machine
                modifiedRecord[38] = TermUtils.getOccurrenceStatus(Integer.valueOf(modifiedRecord[8])).toString()
                        .toLowerCase(Locale.ENGLISH);

                // convert start date (e.g. 21/08/94) into ISO format
                Date startDate = df.parse(modifiedRecord[2]);
                modifiedRecord[2] = Constants.ISO_DF.format(startDate);

                // convert end date (e.g. 21/08/94) into ISO format
                Date endDate = df.parse(modifiedRecord[3]);
                modifiedRecord[3] = Constants.ISO_DF.format(endDate);

                // combine start and end date into date range for eventDate
                modifiedRecord[15] = modifiedRecord[2] + "/" + modifiedRecord[3];

                // calculate samplingEffort in number of trap days. The division is done in floating point and then
                // rounded: a period that spans a daylight-saving change is an hour short of (or over) a whole
                // number of days, and integer division would drop a full day in that case.
                long diff = endDate.getTime() - startDate.getTime();
                String trapDays = String.valueOf(Math.round(diff / millisPerDay));
                modifiedRecord[26] = trapDays + " trap day(s)";

                // store sampleSize even though it's the same as samplingEffort
                modifiedRecord[24] = trapDays;

                // eventID for this sampling period
                modifiedRecord[49] = constructEventID(modifiedRecord[15]);

                // find name in taxa list
                String name = modifiedRecord[4].trim();
                // only use canonical name in lookup
                String[] parts = name.split(" ");
                if (parts.length >= 2) {
                    String canonical = parts[0].trim();

                    // exclude "sp." from canonical name
                    String specificEpithet = parts[1].trim();
                    if (!specificEpithet.equals("sp.")) {
                        canonical += " " + specificEpithet;
                    }

                    NameUsage found = names.get(canonical);
                    if (found != null) {
                        String authorship = found.getAuthorship();
                        modifiedRecord[43] = found.getGenus();
                        // taxa without authorship must not end up as "<name> null"
                        modifiedRecord[44] = Strings.isNullOrEmpty(authorship) ? found.getCanonicalName()
                                : found.getCanonicalName() + " " + authorship;
                        modifiedRecord[45] = authorship;
                        modifiedRecord[46] = (found.getRank() == null) ? null
                                : found.getRank().toString().toLowerCase(Locale.ENGLISH);
                        modifiedRecord[47] = found.getTaxonID();

                        // names that changed store previous identification in "previousIdentifications"
                        if (canonical.equals("Bena bicolorana")) {
                            modifiedRecord[50] = "Bena prasinana L.";
                        } else if (canonical.equals("Pseudoips prasinana")) {
                            modifiedRecord[50] = "Pseudoips fagana F.";
                        }
                    } else {
                        // a Set ignores duplicates, no need to check first
                        namesNotFound.add(name);
                    }
                } else {
                    LOG.error("*****Bad species name encountered: " + name);
                }

                // construct unique occurrenceID for this abundance record:
                // Format: "urn:[institutionCode]:[startDate/endDate]:[taxonID]"
                // Example: "urn:zmuc:1994-08-12/1994-08-21:1301"
                // NOTE(review): when the name was not found above, column 47 is presumably still null (assuming the
                // source row has fewer than 48 columns), so the ID ends in ":null" and is not unique - confirm
                // whether such rows should get a different identifier.
                modifiedRecord[32] = modifiedRecord[49] + ":" + modifiedRecord[47];

                // always output line to new occurrences file
                String row = FileUtils.tabRow(modifiedRecord);
                writerOccs.write(row);

                // only output line to events file if event hasn't been included yet
                if (!events.contains(modifiedRecord[49])) {
                    writerEvents.write(row);
                    events.add(modifiedRecord[49]);
                }
            }
            LOG.info("Iterated over " + line + " rows.");
            LOG.info("Found " + events.size() + " unique events.");

            LOG.warn("***** " + namesNotFound.size() + " names not found in taxa list: ");
            for (String notFound : namesNotFound) {
                LOG.warn(notFound);
            }

        } catch (Exception e) {
            // some error validating this file, report where it happened
            LOG.error("Exception caught while iterating over file (line " + line + ")", e);
        } finally {
            // nested so that a failure while closing one resource cannot leave the others open
            try {
                if (iter != null) {
                    iter.close();
                }
                reader.close();
            } finally {
                try {
                    writerEvents.close();
                } finally {
                    writerOccs.close();
                }
            }
        }
    }

    /**
     * Iterates over the original Coleoptera source file and does the following:
     * i) cleans it (e.g. converts dates to ISO format)
     * ii) augments it (e.g. adds new columns for sample size, higher taxonomy from the name matching service, etc)
     * iii) writes every row to the Coleoptera occurrences file.
     * <p>
     * No events file is written here: every Coleoptera sampling event is expected to be among the eventIDs already
     * collected by processLepidoptera(File), which therefore has to run first; an error is logged for each eventID
     * that is missing.
     * <p>
     * Any exception raised while iterating is logged and ends the iteration; rows written up to that point remain in
     * the output file.
     *
     * @param output directory to write files to
     *
     * @throws IOException if the source resource is missing, or an I/O error propagates from opening or closing the
     *                     reader or writer
     */
    public static void processColeoptera(File output) throws IOException {
        final String resource = "/datasets/nhmd/Coleoptera_1992-2009-v3.csv";

        // load the original source file to process, failing with a clear message when it is not on the classpath
        InputStream fis = RooftopBugs.class.getResourceAsStream(resource);
        if (fis == null) {
            throw new IOException("Resource not found on classpath: " + resource);
        }

        // create an iterator on the file
        CSVReader reader = CSVReaderFactory.build(fis, "UTF-8", ";", '"', 1);

        // get header row for the new occurrence file that this method will output
        String[] header = getColeopteraHeader();

        // observations file
        Writer writerOccs = FileUtils.startOccurrencesFile(output, header, coleopteraOccurrencesFileName);

        // to capture bad names
        Set<String> namesNotFound = Sets.newTreeSet();

        // source dates look like 5/17/93; the formatter does not change per row so it is built once
        // ("da" is the ISO 639 language code for Danish)
        DateFormat df = new SimpleDateFormat("MM/dd/yy", new Locale("da", "DK"));
        final double millisPerDay = 24 * 60 * 60 * 1000;

        // all Coleoptera recorded between 1992 and 1999 were identified by Michael Hansen, then Jan Pedersen took
        // over. The cut-off is midnight on 2000-01-01: clear() resets the time-of-day fields, which would otherwise
        // hold the current wall-clock time and misattribute samples starting on 1 January 2000.
        Calendar cutoff = Calendar.getInstance();
        cutoff.clear();
        cutoff.set(2000, Calendar.JANUARY, 1);
        Date identifierChange = cutoff.getTime();

        ClosableReportingIterator<String[]> iter = null;
        int line = 0;
        try {
            iter = reader.iterator();
            while (iter.hasNext()) {
                line++;
                String[] record = iter.next();
                if (record == null || record.length == 0) {
                    continue;
                }

                // create new augmented record (padded with nulls up to the width of the output header)
                String[] modifiedRecord = Arrays.copyOf(record, header.length);

                // add static values
                modifiedRecord[16] = "The material sample was collected, and either preserved or destructively processed."; // eventRemarks
                modifiedRecord[17] = "Denmark"; // country
                modifiedRecord[18] = "DK"; // countryCode
                modifiedRecord[19] = "Light trap on rooftop of Zoological Museum, Natural History Museum of Denmark (ZMUC)"; // locality
                modifiedRecord[20] = "55.702512"; // decimalLatitude
                modifiedRecord[21] = "12.558956"; // decimalLongitude
                modifiedRecord[22] = "WGS84"; // geodeticDatum
                modifiedRecord[23] = "modified Robinson light trap"; // samplingProtocol
                modifiedRecord[25] = "day"; // sampleSizeUnit
                modifiedRecord[27] = "http://creativecommons.org/licenses/by/4.0/legalcode"; // license
                modifiedRecord[28] = "Event"; // type
                modifiedRecord[29] = "Zoological Museum, Natural History Museum of Denmark (ZMUC)"; // rightsHolder
                modifiedRecord[30] = "ZMUC"; // institutionCode
                modifiedRecord[31] = "ZMUC"; // ownerInstitutionCode
                modifiedRecord[33] = "MaterialSample"; // basisOfRecord
                modifiedRecord[34] = "Ole Karsholt"; // recordedBy
                modifiedRecord[37] = "individuals"; // organismQuantityType
                modifiedRecord[39] = "Animalia"; // kingdom
                modifiedRecord[40] = "Arthropoda"; // phylum
                modifiedRecord[41] = "Insecta"; // class

                // store organismQuantity even though it's the same as individualCount
                modifiedRecord[36] = modifiedRecord[6]; // value copied from individualCount

                // occurrenceStatus (present vs absent); fixed locale keeps the lower-casing stable on any machine
                modifiedRecord[38] = TermUtils.getOccurrenceStatus(Integer.valueOf(modifiedRecord[6])).toString()
                        .toLowerCase(Locale.ENGLISH);

                // convert start date (e.g. 5/17/93) into ISO format
                Date startDate = df.parse(modifiedRecord[4]);
                modifiedRecord[4] = Constants.ISO_DF.format(startDate);

                // convert end date (e.g. 5/23/93) into ISO format
                Date endDate = df.parse(modifiedRecord[5]);
                modifiedRecord[5] = Constants.ISO_DF.format(endDate);

                // combine start and end date into date range for eventDate
                modifiedRecord[15] = modifiedRecord[4] + "/" + modifiedRecord[5];

                // calculate samplingEffort in number of trap days. The division is done in floating point and then
                // rounded: a period that spans a daylight-saving change is an hour short of (or over) a whole
                // number of days, and integer division would drop a full day in that case.
                long diff = endDate.getTime() - startDate.getTime();
                String trapDays = String.valueOf(Math.round(diff / millisPerDay));
                modifiedRecord[26] = trapDays + " trap day(s)";

                // store sampleSize even though it's the same as samplingEffort
                modifiedRecord[24] = trapDays;

                // identifiedBy depends on which side of the cut-off the sampling period started
                modifiedRecord[35] = (startDate.before(identifierChange)) ? "Michael Hansen" : "Jan Pedersen";

                // eventID for this sampling period
                modifiedRecord[49] = constructEventID(modifiedRecord[15]);

                // verify taxonomy
                String name = modifiedRecord[2].trim();

                // for more accurate match, we take higher taxonomy into consideration
                LinneanClassification cl = new NameUsage();
                cl.setKingdom(modifiedRecord[39]); // static
                cl.setPhylum(modifiedRecord[40]); // static
                cl.setClazz(modifiedRecord[41]); // static
                cl.setOrder(modifiedRecord[0]);
                cl.setSpecies(name);

                // lowest rank specified
                // NOTE(review): written as returned by Rank.toString(), whereas processLepidoptera lower-cases
                // taxonRank - confirm which form the published files should use.
                Rank rank = TermUtils.lowestRank(cl);
                if (rank != null) {
                    modifiedRecord[46] = rank.toString();
                }

                // verify name, and add higher taxonomy; names already verified by hand need no web service call
                if (!validColeopteraNamesNotInNub.contains(name)) {
                    NameUsageMatch match = MATCHING_SERVICE.match(name, rank, cl, false, false);
                    if (match.getMatchType().equals(NameUsageMatch.MatchType.EXACT)) {
                        modifiedRecord[48] = match.getStatus().toString();
                        modifiedRecord[42] = match.getFamily();
                        modifiedRecord[43] = match.getGenus();
                        modifiedRecord[44] = match.getScientificName();
                        modifiedRecord[47] = match.getUsageKey().toString();
                    } else if (!namesNotFound.contains(name)) {
                        // report each unmatched name only once
                        LOG.error(match.getMatchType().toString() + " match for: " + name + " (with rank " + rank
                                + ") to: " + match.getScientificName() + " (with rank " + match.getRank() + ")");
                        namesNotFound.add(name);
                    }
                }

                // construct unique occurrenceID for this abundance record:
                // Format: "urn:[institutionCode]:[startDate/endDate]:coleoptera:[row#]"
                // Example: "urn:zmuc:1994-08-12/1994-08-21:coleoptera:1301"
                modifiedRecord[32] = modifiedRecord[49] + ":coleoptera:" + line;

                // always output line to new occurrences file
                String row = FileUtils.tabRow(modifiedRecord);
                writerOccs.write(row);

                // all Coleoptera sampling events are a subset of all Lepidoptera sampling events
                if (!events.contains(modifiedRecord[49])) {
                    LOG.error("Sampling event not found: " + modifiedRecord[49]);
                }
            }
            LOG.info("Iterated over " + line + " rows.");

            LOG.warn("***** " + namesNotFound.size() + " names not found in taxa list: ");
            for (String notFound : namesNotFound) {
                LOG.warn(notFound);
            }

        } catch (Exception e) {
            // some error validating this file, report where it happened
            LOG.error("Exception caught while iterating over file (line " + line + ")", e);
        } finally {
            // nested so that a failure while closing the reader cannot leave the writer open
            try {
                if (iter != null) {
                    iter.close();
                }
                reader.close();
            } finally {
                writerOccs.close();
            }
        }
    }

    /**
     * Builds the 51-column header shared by the Lepidoptera events and occurrences files. Positions 0-13 keep the
     * order of the source columns; positions 14-50 are the columns added during processing.
     *
     * @return array of column names in output files for Lepidoptera data
     */
    @NotNull
    private static String[] getLepidopteraHeader() {
        // TODO: minimum/maximumElevationInMeters
        return new String[] {
                // ***original columns
                "order", // 0: e.g. LEPIDOPTERA, maps to dwc:order
                "group", // 1: e.g. ACROLEPIIDAE
                "date1", // 2: e.g. 12/08/94, converted to ISO format 1994-08-12
                "date2", // 3: e.g. 21/08/94, converted to ISO format 1994-08-21
                "name", // 4: e.g. Acrolepiopsis assectella Zell.:
                "year", // 5: e.g. 1994, maps to dwc:year
                "w", // 6: e.g. 1 ex 12.-21.viii.
                "i", // 7: e.g. 1
                // 8: source column Antal, e.g. 1 - total abundance for species recorded during that trap event,
                // maps to dwc:individualCount (must pair with dwc:organismQuantityType)
                "individualCount",
                "month1", // 9: e.g. 8
                "day1", // 10: e.g. 12
                "month2", // 11: e.g. 8
                "day2", // 12: e.g. 21
                "no_one", // 13: e.g. 1

                // ***new augmented columns of information
                "identificationRemarks", // 14: e.g. "Either species a or b"
                "eventDate", // 15: date range, e.g. 1994-08-12/1994-08-21
                "eventRemarks", // 16: static remark about how the material sample was handled
                "country", // 17: Denmark
                "countryCode", // 18: DK
                "locality", // 19: light trap on the rooftop of the museum
                "decimalLatitude", // 20: 55.702512
                "decimalLongitude", // 21: 12.558956
                "geodeticDatum", // 22: WGS84
                "samplingProtocol", // 23: modified Robinson light trap
                "sampleSizeValue", // 24: time duration in number of trap days
                "sampleSizeUnit", // 25: day
                "samplingEffort", // 26: number of trap days
                "license", // 27: http://creativecommons.org/licenses/by/4.0/legalcode
                "type", // 28: Event
                "rightsHolder", // 29: Natural History Museum of Denmark
                "institutionCode", // 30: ZMUC
                "ownerInstitutionCode", // 31: ZMUC
                "occurrenceID", // 32: unique occurrenceID
                "basisOfRecord", // 33: MaterialSample
                "recordedBy", // 34: Ole Karsholt
                "identifiedBy", // 35: Ole Karsholt
                "organismQuantity", // 36: copied from individualCount
                "organismQuantityType", // 37: individuals
                "occurrenceStatus", // 38: present or absent - depending on individualCount

                // taxonomy
                "kingdom", // 39
                "phylum", // 40
                "class", // 41
                "gbif_family", // 42
                "genus", // 43
                "scientificName", // 44
                "scientificNameAuthorship", // 45
                "taxonRank", // 46
                "taxonID", // 47
                "gbif_taxonomicStatus", // 48

                "eventID", // 49: unique eventID
                "previousIdentifications" // 50: to capture name change
        };
    }

    /**
     * Builds the 51-column header of the Coleoptera occurrences file. Positions 0-14 keep the order of the source
     * columns; positions 15-50 are the columns added during processing.
     *
     * @return array of column names in the output file for Coleoptera data
     */
    @NotNull
    private static String[] getColeopteraHeader() {
        // TODO: minimum/maximumElevationInMeters
        return new String[] {
                // ***original columns
                "order", // 0: e.g. COLEOPTERA, maps to dwc:order
                "group", // 1: e.g. ADERIDAE
                "scientificName", // 2: source column name, e.g. Aderus populneus (Creutzer), maps to dwc:scientificName
                "year", // 3: e.g. 1993, maps to dwc:year
                "date1", // 4: e.g. 5/17/93, converted to ISO format 1993-05-17
                "date2", // 5: e.g. 5/23/93, converted to ISO format 1993-05-23
                // 6: source column individuals, e.g. 1 - total abundance for species recorded during that trap
                // event, maps to dwc:individualCount (must pair with dwc:organismQuantityType)
                "individualCount",
                "month1", // 7: e.g. 5
                "day1", // 8: e.g. 17
                "month2", // 9: e.g. 5
                "day2", // 10: e.g. 23
                "startday", // 11: e.g. 137
                "endday", // 12: e.g. 143
                "diff", // 13: e.g. 7
                "newname", // 14: e.g. 44

                // ***new augmented columns of information
                "eventDate", // 15: date range, e.g. 1994-08-12/1994-08-21
                "eventRemarks", // 16: static remark about how the material sample was handled
                "country", // 17: Denmark
                "countryCode", // 18: DK
                "locality", // 19: light trap on the rooftop of the museum
                "decimalLatitude", // 20: 55.702512
                "decimalLongitude", // 21: 12.558956
                "geodeticDatum", // 22: WGS84
                "samplingProtocol", // 23: modified Robinson light trap
                "sampleSizeValue", // 24: time duration in number of trap days
                "sampleSizeUnit", // 25: day
                "samplingEffort", // 26: number of trap days
                "license", // 27: http://creativecommons.org/licenses/by/4.0/legalcode
                "type", // 28: Event
                "rightsHolder", // 29: Natural History Museum of Denmark
                "institutionCode", // 30: ZMUC
                "ownerInstitutionCode", // 31: ZMUC
                "occurrenceID", // 32: unique occurrenceID
                "basisOfRecord", // 33: MaterialSample
                "recordedBy", // 34: Ole Karsholt
                "identifiedBy", // 35: Michael Hansen or Jan Pedersen, depending on the start date
                "organismQuantity", // 36: copied from individualCount
                "organismQuantityType", // 37: individuals
                "occurrenceStatus", // 38: present or absent - depending on individualCount

                // taxonomy
                "kingdom", // 39
                "phylum", // 40
                "class", // 41
                "gbif_family", // 42
                "gbif_genus", // 43
                "gbif_scientificName", // 44
                "gbif_scientificNameAuthorship", // 45
                "taxonRank", // 46
                "gbif_taxonID", // 47
                "gbif_taxonomicStatus", // 48

                "eventID", // 49: unique eventID
                "previousIdentifications" // 50: to capture name change
        };
    }

    /**
     * Builds the eventID of a sampling period by prefixing its date range with the institution URN, i.e.
     * "urn:[institutionID]:[startDate/endDate]". E.g. "urn:zmuc:1994-08-12/1994-08-21"
     *
     * @param eventDate event date range, e.g. "1994-08-12/1994-08-21"
     *
     * @return eventID
     */
    private static String constructEventID(@NotNull String eventDate) {
        StringBuilder eventId = new StringBuilder("urn:zmuc:");
        eventId.append(eventDate);
        return eventId.toString();
    }
}