nl.knaw.huygens.timbuctoo.tools.importer.ckcc.CKCCImporter.java Source code

Java tutorial

Introduction

Here is the source code for nl.knaw.huygens.timbuctoo.tools.importer.ckcc.CKCCImporter.java

Source

package nl.knaw.huygens.timbuctoo.tools.importer.ckcc;

/*
 * #%L
 * Timbuctoo tools
 * =======
 * Copyright (C) 2012 - 2015 Huygens ING
 * =======
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 3 of the 
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public 
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-3.0.html>.
 * #L%
 */

import java.io.File;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.Set;

import nl.knaw.huygens.tei.DelegatingVisitor;
import nl.knaw.huygens.tei.Document;
import nl.knaw.huygens.tei.Element;
import nl.knaw.huygens.tei.ElementHandler;
import nl.knaw.huygens.tei.Traversal;
import nl.knaw.huygens.tei.Visitor;
import nl.knaw.huygens.tei.XmlContext;
import nl.knaw.huygens.tei.handlers.DefaultElementHandler;
import nl.knaw.huygens.timbuctoo.Repository;
import nl.knaw.huygens.timbuctoo.config.TypeNames;
import nl.knaw.huygens.timbuctoo.index.IndexManager;
import nl.knaw.huygens.timbuctoo.model.Collective;
import nl.knaw.huygens.timbuctoo.model.Location;
import nl.knaw.huygens.timbuctoo.model.Person;
import nl.knaw.huygens.timbuctoo.model.Reference;
import nl.knaw.huygens.timbuctoo.model.RelationType;
import nl.knaw.huygens.timbuctoo.model.ckcc.CKCCCollective;
import nl.knaw.huygens.timbuctoo.model.ckcc.CKCCDocument;
import nl.knaw.huygens.timbuctoo.model.ckcc.CKCCPerson;
import nl.knaw.huygens.timbuctoo.model.ckcc.CKCCRelation;
import nl.knaw.huygens.timbuctoo.model.util.Datable;
import nl.knaw.huygens.timbuctoo.model.util.FloruitPeriod;
import nl.knaw.huygens.timbuctoo.model.util.Link;
import nl.knaw.huygens.timbuctoo.model.util.PersonName;
import nl.knaw.huygens.timbuctoo.model.util.PersonNameComponent;
import nl.knaw.huygens.timbuctoo.model.util.RelationBuilder;
import nl.knaw.huygens.timbuctoo.storage.ValidationException;
import nl.knaw.huygens.timbuctoo.tools.config.ToolsInjectionModule;
import nl.knaw.huygens.timbuctoo.tools.importer.CSVImporter;
import nl.knaw.huygens.timbuctoo.tools.importer.CaptureHandler;
import nl.knaw.huygens.timbuctoo.tools.importer.DefaultImporter;
import nl.knaw.huygens.timbuctoo.tools.importer.neww.LocationConcordance;
import nl.knaw.huygens.timbuctoo.util.Files;
import nl.knaw.huygens.timbuctoo.util.Text;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Splitter;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Sets;
import com.google.inject.Injector;

/**
 * Imports data of the CKCC project into the repository.
 */
public class CKCCImporter extends DefaultImporter {

    private static final Logger LOG = LoggerFactory.getLogger(CKCCImporter.class);

    /**
     * Command-line entry point: runs a full CKCC import (without letters).
     *
     * @param args optional; {@code args[0]} is the input directory, otherwise a
     *             default test-data location is used
     * @throws Exception if setting up or running the import fails
     */
    public static void main(String[] args) throws Exception {
        final Stopwatch timer = Stopwatch.createStarted();

        // Handle commandline arguments: first argument overrides the default input directory
        final String inputDirName;
        if (args.length > 0) {
            inputDirName = args[0];
        } else {
            inputDirName = "../../timbuctoo-testdata/src/main/resources/ckcc/";
        }

        CKCCImporter ckccImporter = null;
        try {
            Injector guice = ToolsInjectionModule.createInjector();
            Repository repo = guice.getInstance(Repository.class);
            IndexManager index = guice.getInstance(IndexManager.class);

            ckccImporter = new CKCCImporter(repo, index, inputDirName);
            ckccImporter.importAll(false);
        } finally {
            // Close the importer even when the import failed half-way, then report timing.
            if (ckccImporter != null) {
                ckccImporter.close();
            }
            LOG.info("Time used: {}", timer);
        }
    }

    // ---------------------------------------------------------------------------

    /** Identifier passed to the {@code DefaultImporter} constructor for this import. */
    private static final String VRE_ID = "CKCC";
    /** File extensions used when listing the TEI input files in the input directory. */
    private static final String[] TEI_EXTENSIONS = { "xml" };
    /** Name of the one input file that is read with the collectives visitor instead of the persons visitor. */
    private static final String ORGANIZATIONS = "CKCC-organizations.xml";

    /** Directory that holds all input files (TEI XML and CSV). */
    private final File inputDir;
    /** Converts location identifiers found in the letter metadata; loaded from "location-concordance.csv". */
    private final LocationConcordance concordance;

    /**
     * Creates an importer that reads its input from the given directory.
     * The directory is reported on standard output; a path that is not a
     * directory is reported as well, but does not stop construction here.
     *
     * @param repository   repository handed to the base importer
     * @param indexManager index manager handed to the base importer
     * @param inputDirName path of the directory with the CKCC input files
     * @throws Exception if the base importer or the location concordance cannot be set up
     */
    public CKCCImporter(Repository repository, IndexManager indexManager, String inputDirName) throws Exception {
        super(repository, indexManager, VRE_ID);

        this.inputDir = new File(inputDirName);
        if (!inputDir.isDirectory()) {
            System.out.printf("%nNot a directory: %s%n", inputDir.getAbsolutePath());
        } else {
            System.out.printf("%nImporting from %s%n", inputDir.getCanonicalPath());
        }

        File concordanceFile = new File(inputDir, "location-concordance.csv");
        this.concordance = new LocationConcordance(concordanceFile);
    }

    /**
     * Runs the complete import: relation types, the TEI files with collectives
     * and persons, the relations CSV and, optionally, the letter metadata.
     * The status is displayed and the import log is closed in all cases.
     *
     * @param letters whether the letter metadata files must be imported as well
     * @throws Exception if any import step fails
     */
    private void importAll(boolean letters) throws Exception {
        try {
            openImportLog("ckcc-log.txt");
            importRelationTypes();
            setupRelationTypeDefs();

            // collectives and persons; the tree set gives a sorted, duplicate-free order
            Collection<File> teiFiles = FileUtils.listFiles(inputDir, TEI_EXTENSIONS, true);
            for (File teiFile : Sets.newTreeSet(teiFiles)) {
                handleXmlFile(teiFile);
            }

            // relations
            File relationsFile = new File(inputDir, "relations.csv");
            new RelationImporter().handleFile(relationsFile, 0, false);

            // documents and relations
            if (letters) {
                String[] letterFileNames = {
                        "meta-barl001.csv",
                        "meta-beec002.csv",
                        "meta-groo001.csv",
                        "meta-huyg001.csv",
                        "meta-huyg003.csv",
                        "meta-leeu027.csv",
                        "meta-nier005.csv"
                };
                for (String letterFileName : letterFileNames) {
                    // a fresh importer per file, as each file is handled independently
                    new LetterImporter().handleFile(new File(inputDir, letterFileName), 0, false);
                }
            }

        } finally {
            displayStatus();
            closeImportLog();
        }
    }

    // ---------------------------------------------------------------------------

    /**
     * Logs the file name, reads the file and walks its XML with the visitor
     * that matches it: the organizations file yields collectives, every other
     * file yields persons.
     *
     * @param file the TEI XML file to import
     * @throws Exception if the file cannot be read or processed
     */
    private void handleXmlFile(File file) throws Exception {
        final String name = file.getName();
        log(".. %s%n", name);

        final String content = Files.readTextFromFile(file);

        final Visitor visitor;
        if (name.equals(ORGANIZATIONS)) {
            visitor = new CollectivesVisitor();
        } else {
            visitor = new PersonsVisitor();
        }
        Document.createFromXml(content).accept(visitor);
    }

    /**
     * Mutable state shared by the element handlers while one XML file is visited.
     */
    private class ImportContext extends XmlContext {
        /** Collective under construction; created by CollectiveHandler on entering a "person" element. */
        public CKCCCollective collective;
        /** Person under construction; created by PersonHandler on entering a "person" element. */
        public CKCCPerson person;
        /** Name under construction; created by PersNameHandler on entering a "persName" element. */
        public PersonName personName;
    }

    // ---------------------------------------------------------------------------

    /**
     * Visitor for the organizations file.
     *
     * We have chosen to use the CKCC data "as is".
     * CKCC handles organizations (collectives) as persons, so the visitor must respect this.
     * However, we store these entities as collectives.
     */
    private class CollectivesVisitor extends DelegatingVisitor<ImportContext> {
        public CollectivesVisitor() {
            super(new ImportContext());

            // Elements without a dedicated handler; the listed names are accepted silently.
            DefaultHandler fallback = new DefaultHandler("listPerson", "occupation", "persName", "sex", "TEI");
            setDefaultElementHandler(fallback);

            // A "person" element delimits one collective; its "surname" holds the name.
            addElementHandler(new CollectiveHandler(), "person");
            addElementHandler(new CollectiveNameHandler(), "surname");
            addElementHandler(new CollectiveXrefHandler(), "xref");
        }
    }

    private class CollectiveHandler implements ElementHandler<ImportContext> {
        @Override
        public Traversal enterElement(Element element, ImportContext context) {
            context.collective = new CKCCCollective();
            return Traversal.NEXT;
        }

        @Override
        public Traversal leaveElement(Element element, ImportContext context) {
            try {
                String storedId = addDomainEntity(CKCCCollective.class, context.collective);
                indexManager.addEntity(CKCCCollective.class, storedId);
            } catch (Exception e) {
                e.printStackTrace();
            }
            return Traversal.NEXT;
        }
    }

    /**
     * Uses the captured element text as the name of the current collective.
     */
    private class CollectiveNameHandler extends CaptureHandler<ImportContext> {
        @Override
        public void handleContent(Element element, ImportContext context, String text) {
            CKCCCollective current = context.collective;
            current.setName(text);
        }
    }

    /**
     * Stores the captured text of a cross-reference of type "CKCC" as the urn
     * of the current collective; other types are reported in the import log.
     */
    private class CollectiveXrefHandler extends CaptureHandler<ImportContext> {
        @Override
        public void handleContent(Element element, ImportContext context, String text) {
            if (!element.hasType("CKCC")) {
                log("## Unknown xref type %s%n", element.getType());
                return;
            }
            context.collective.setUrn(text);
        }
    }

    // ---------------------------------------------------------------------------

    /**
     * Visitor for the person files: wires each relevant element name to the
     * handler that fills the person under construction.
     */
    private class PersonsVisitor extends DelegatingVisitor<ImportContext> {
        public PersonsVisitor() {
            super(new ImportContext());

            // Elements without a dedicated handler; the listed names are accepted silently.
            DefaultHandler fallback = new DefaultHandler("listPerson", "placeName", "TEI");
            setDefaultElementHandler(fallback);

            // entity boundaries
            addElementHandler(new PersonHandler(), "person");

            // names
            addElementHandler(new PersNameHandler(), "persName");
            addElementHandler(new NameComponentHandler(), "surname", "forename", "roleName", "addName", "nameLink",
                    "genName");

            // properties
            addElementHandler(new GenderHandler(), "sex");
            addElementHandler(new BirthHandler(), "birth");
            addElementHandler(new DeathHandler(), "death");
            addElementHandler(new FloruitHandler(), "floruit");
            addElementHandler(new LinkHandler(), "link");
            addElementHandler(new PersonXrefHandler(), "xref");
            addElementHandler(new NotesHandler(), "occupation", "relation");
        }
    }

    /**
     * Fallback handler: reports every element in the import log unless its
     * name was passed to the constructor. Traversal always continues.
     */
    private class DefaultHandler extends DefaultElementHandler<ImportContext> {
        private final Set<String> silentNames;

        /**
         * @param names element names that must not be reported
         */
        public DefaultHandler(String... names) {
            silentNames = Sets.newHashSet(names);
        }

        @Override
        public Traversal enterElement(Element element, ImportContext context) {
            String elementName = element.getName();
            boolean expected = silentNames.contains(elementName);
            if (!expected) {
                log("## Unexpected element: %s%n", elementName);
            }
            return Traversal.NEXT;
        }
    }

    private class PersonHandler implements ElementHandler<ImportContext> {
        @Override
        public Traversal enterElement(Element element, ImportContext context) {
            context.person = new CKCCPerson();
            return Traversal.NEXT;
        }

        @Override
        public Traversal leaveElement(Element element, ImportContext context) {
            try {
                String storedId = addDomainEntity(CKCCPerson.class, context.person);
                indexManager.addEntity(CKCCPerson.class, storedId);
            } catch (Exception e) {
                e.printStackTrace();
            }
            return Traversal.NEXT;
        }
    }

    /**
     * Collects one name of a person: a fresh name is started on entering the
     * element and added to the current person on leaving it.
     */
    private class PersNameHandler implements ElementHandler<ImportContext> {
        @Override
        public Traversal enterElement(Element element, ImportContext context) {
            PersonName fresh = new PersonName();
            context.personName = fresh;
            return Traversal.NEXT;
        }

        @Override
        public Traversal leaveElement(Element element, ImportContext context) {
            PersonName completed = context.personName;
            context.person.addName(completed);
            return Traversal.NEXT;
        }
    }

    /**
     * Adds the captured text as a component of the name under construction;
     * the component type follows from the element name. An element name that
     * is not recognized is reported in the import log.
     */
    private class NameComponentHandler extends CaptureHandler<ImportContext> {
        @Override
        public void handleContent(Element element, ImportContext context, String text) {
            PersonName name = context.personName;
            if (element.hasName("surname")) {
                name.addNameComponent(PersonNameComponent.Type.SURNAME, text);
            } else if (element.hasName("forename")) {
                name.addNameComponent(PersonNameComponent.Type.FORENAME, text);
            } else if (element.hasName("roleName")) {
                name.addNameComponent(PersonNameComponent.Type.ROLE_NAME, text);
            } else if (element.hasName("addName")) {
                name.addNameComponent(PersonNameComponent.Type.ADD_NAME, text);
            } else if (element.hasName("nameLink")) {
                name.addNameComponent(PersonNameComponent.Type.NAME_LINK, text);
            } else if (element.hasName("genName")) {
                name.addNameComponent(PersonNameComponent.Type.GEN_NAME, text);
            } else {
                // Terminate the entry with a line separator, like every other log message
                // in this importer, so that the next entry starts on its own line.
                log("## Unknown component: %s%n", element.getName());
            }
        }
    }

    /**
     * Translates the captured text to a gender value and sets it on the
     * current person; any text that is not recognized yields UNKNOWN.
     */
    private class GenderHandler extends CaptureHandler<ImportContext> {
        @Override
        public void handleContent(Element element, ImportContext context, String text) {
            final Person.Gender gender;
            if (text.equals("male")) {
                gender = Person.Gender.MALE;
            } else if (text.equals("female")) {
                gender = Person.Gender.FEMALE;
            } else if (text.equals("not applicable")) {
                gender = Person.Gender.NOT_APPLICABLE;
            } else {
                gender = Person.Gender.UNKNOWN;
            }
            context.person.setGender(gender);
        }
    }

    /**
     * Sets the birth date of the current person from the "when" attribute,
     * provided that attribute value is not empty.
     */
    private class BirthHandler extends DefaultElementHandler<ImportContext> {
        @Override
        public Traversal enterElement(Element element, ImportContext context) {
            String when = element.getAttribute("when");
            if (when.isEmpty()) {
                return Traversal.NEXT;
            }
            context.person.setBirthDate(new Datable(when));
            return Traversal.NEXT;
        }
    }

    /**
     * Sets the death date of the current person from the "when" attribute,
     * provided that attribute value is not empty.
     */
    private class DeathHandler extends DefaultElementHandler<ImportContext> {
        @Override
        public Traversal enterElement(Element element, ImportContext context) {
            String when = element.getAttribute("when");
            if (when.isEmpty()) {
                return Traversal.NEXT;
            }
            context.person.setDeathDate(new Datable(when));
            return Traversal.NEXT;
        }
    }

    /**
     * Sets the floruit of the current person from the attributes of the element.
     * A "when" attribute is used for both ends of the period; otherwise
     * "notBefore" and "notAfter" supply the start and the end.
     */
    private class FloruitHandler extends DefaultElementHandler<ImportContext> {
        @Override
        public Traversal enterElement(Element element, ImportContext context) {
            String from = null;
            String until = null;
            if (element.hasAttribute("when")) {
                from = element.getAttribute("when");
                until = from;
            } else {
                // The use of "notBefore" and "notAfter" is consistent with CKCC,
                // but probably not as how it is intended in the TEI guidelines!
                if (element.hasAttribute("notBefore")) {
                    from = element.getAttribute("notBefore");
                }
                if (element.hasAttribute("notAfter")) {
                    until = element.getAttribute("notAfter");
                }
            }
            FloruitPeriod period = createFloruit(from, until);
            // The floruit is set even when no period could be created (null).
            context.person.setFloruit(period);
            return Traversal.NEXT;
        }

        /**
         * Builds a period from the values that are not blank.
         *
         * @return a period of both values, a period of the single non-blank
         *         value, or {@code null} when both values are blank
         */
        private FloruitPeriod createFloruit(String start, String end) {
            boolean hasStart = StringUtils.isNotBlank(start);
            boolean hasEnd = StringUtils.isNotBlank(end);

            if (hasStart && hasEnd) {
                return new FloruitPeriod(start, end);
            }
            if (hasStart) {
                return new FloruitPeriod(start);
            }
            if (hasEnd) {
                return new FloruitPeriod(end);
            }
            return null;
        }
    }

    /**
     * Wraps the captured text in a link and adds it to the current person.
     */
    private class LinkHandler extends CaptureHandler<ImportContext> {
        @Override
        protected void handleContent(Element element, ImportContext context, String text) {
            CKCCPerson current = context.person;
            Link link = new Link(text);
            current.addLink(link);
        }
    }

    /**
     * Stores the captured text of a cross-reference on the current person:
     * type "CKCC" sets the urn, type "CEN" sets the CEN id; any other type
     * is reported in the import log.
     */
    private class PersonXrefHandler extends CaptureHandler<ImportContext> {
        @Override
        public void handleContent(Element element, ImportContext context, String text) {
            if (element.hasType("CKCC")) {
                context.person.setUrn(text);
                return;
            }
            if (element.hasType("CEN")) {
                context.person.setCenId(text);
                return;
            }
            log("## Unknown xref type %s%n", element.getType());
        }
    }

    /**
     * Appends the captured text to the notes of the current person, using
     * "; " as the separator argument for the appended text.
     */
    private class NotesHandler extends CaptureHandler<ImportContext> {
        @Override
        protected void handleContent(Element element, ImportContext context, String text) {
            CKCCPerson current = context.person;

            StringBuilder notes = new StringBuilder();
            // existing notes first, then the new text
            Text.appendTo(notes, current.getNotes(), "");
            Text.appendTo(notes, text, "; ");

            current.setNotes(notes.toString());
        }
    }

    // ---------------------------------------------------------------------------

    /**
     * Imports relations from a CSV file. Each line holds, in this order, the
     * key of the relation type, the urn of the source entity and the urn of
     * the target entity.
     */
    public class RelationImporter extends CSVImporter {

        private static final int FIELDS = 3;

        public RelationImporter() {
            super(new PrintWriter(System.err));
        }

        /**
         * Adds the relation described by one line. The line is skipped, with a
         * message in the import log, when the relation type or one of the two
         * entities cannot be resolved.
         *
         * @param items the fields of the line
         * @throws Exception if the line has too few fields or storing fails
         */
        @Override
        protected void handleLine(String[] items) throws Exception {
            if (items.length < FIELDS) {
                throw new ValidationException("Lines must have at least %d items", FIELDS);
            }
            Reference typeRef = relationTypes.get(items[0]);
            if (typeRef == null) {
                // Must be checked before typeRef is dereferenced below; an unknown
                // relation type skips the line instead of raising a NullPointerException.
                log("Unknown relation type %s%n", items[0]);
                return;
            }
            RelationType relationType = repository.getRelationTypeById(typeRef.getId(), true);
            Reference sourceRef = getReference(relationType.getSourceTypeName(), items[1]);
            Reference targetRef = getReference(relationType.getTargetTypeName(), items[2]);
            if (sourceRef != null && targetRef != null) {
                addRelation(CKCCRelation.class, typeRef, sourceRef, targetRef, change, "");
            }
        }

        /**
         * Looks up the entity with the given urn and returns a reference to it.
         * Collectives and persons are searched by "urn", locations by "^urn".
         *
         * @param iname type name of the entity; compared case-insensitively with
         *              "collective", "person" and "location"
         * @param urn   the urn to search for
         * @return the reference, or {@code null} (after a message in the import
         *         log) when the entity is not found or the type name is unknown
         */
        private Reference getReference(String iname, String urn) {
            if (iname.equalsIgnoreCase("collective")) {
                CKCCCollective entity = repository.findEntity(CKCCCollective.class, "urn", urn);
                if (entity == null) {
                    log("Did not find collective with urn %s%n", urn);
                    return null;
                }
                return new Reference(Collective.class, entity.getId());
            }
            if (iname.equalsIgnoreCase("person")) {
                CKCCPerson entity = repository.findEntity(CKCCPerson.class, "urn", urn);
                if (entity == null) {
                    log("Did not find person with urn %s%n", urn);
                    return null;
                }
                return new Reference(Person.class, entity.getId());
            }
            if (iname.equalsIgnoreCase("location")) {
                Location entity = repository.findEntity(Location.class, "^urn", urn);
                if (entity == null) {
                    log("Did not find location with urn %s%n", urn);
                    return null;
                }
                return new Reference(Location.class, entity.getId());
            }
            log("Unknown type name %s%n", iname);
            return null;
        }
    }

    // ---------------------------------------------------------------------------

    /**
     * Imports metadata of letters from a CSV file. Each line yields one
     * document, plus relations from that document to its senders, recipients,
     * sender location and recipient location, as far as these can be found in
     * the repository.
     */
    public class LetterImporter extends CSVImporter {

        private static final int FIELDS = 8;

        private static final String URL_PREFIX = "http://ckcc.huygens.knaw.nl/epistolarium/letter.html?id=";

        private final String isCreatedById;
        private final String hasRecipientId;
        private final String hasSenderLocationId;
        private final String hasRecipientLocationId;

        /**
         * Looks up the ids of the four relation types used for letters.
         */
        public LetterImporter() {
            super(new PrintWriter(System.err));
            isCreatedById = repository.getRelationTypeByName("isCreatedBy", true).getId();
            hasRecipientId = repository.getRelationTypeByName("hasRecipient", true).getId();
            hasSenderLocationId = repository.getRelationTypeByName("hasSenderLocation", true).getId();
            hasRecipientLocationId = repository.getRelationTypeByName("hasRecipientLocation", true).getId();
        }

        /**
         * Stores the letter described by one line and links it to the persons
         * and locations mentioned on that line.
         *
         * @param items the fields of the line: letter id, date, senders, sender
         *              location, (unused), recipients, recipient location, (unused)
         * @throws Exception if the line has too few fields or storing fails
         */
        @Override
        protected void handleLine(String[] items) throws Exception {
            if (items.length < FIELDS) {
                throw new ValidationException("Lines must have at least %d items", FIELDS);
            }
            String letterId = items[0];
            String date = items[1];
            String senders = items[2];
            String senderLoc = items[3];
            // items[4], the certainty of the sender location, is not imported
            String recipients = items[5];
            String recipientLoc = items[6];
            // items[7], the certainty of the recipient location, is not imported

            CKCCDocument document = new CKCCDocument();
            document.setUrn(letterId);
            document.setDate(new Datable(date));
            document.setTitle("Letter " + letterId);
            document.addLink(new Link(URL_PREFIX + letterId));
            String storedId = addDomainEntity(CKCCDocument.class, document);

            linkPersons(letterId, storedId, senders, isCreatedById, "sender");
            linkLocation(letterId, storedId, senderLoc, hasSenderLocationId, "sender");
            linkPersons(letterId, storedId, recipients, hasRecipientId, "recipient");
            linkLocation(letterId, storedId, recipientLoc, hasRecipientLocationId, "recipient");
        }

        /**
         * Relates the stored document to every person in a '#'-separated list of urns.
         * Question marks are removed from each urn before the lookup. A urn that
         * matches neither a person nor a collective is reported on standard output;
         * a urn that matches a collective only is passed over without a relation.
         *
         * @param letterId       id of the letter, used in messages
         * @param storedId       id of the stored document, the source of the relation
         * @param urns           the raw field value
         * @param relationTypeId id of the relation type to use
         * @param role           "sender" or "recipient", used in messages
         */
        private void linkPersons(String letterId, String storedId, String urns, String relationTypeId, String role)
                throws Exception {
            if (!valid(urns)) {
                return;
            }
            for (String item : Splitter.on('#').split(urns)) {
                String urn = item.replace("?", "");
                CKCCPerson person = repository.findEntity(CKCCPerson.class, "urn", urn);
                if (person != null) {
                    // NOTE(review): Document resolves to the imported nl.knaw.huygens.tei.Document;
                    // presumably only the simple class name matters to TypeNames - confirm.
                    CKCCRelation relation = RelationBuilder.newInstance(CKCCRelation.class)
                            .withRelationTypeId(relationTypeId)
                            .withSourceType(TypeNames.getInternalName(Document.class)).withSourceId(storedId)
                            .withTargetType(TypeNames.getInternalName(Person.class))
                            .withTargetId(person.getId()).build();
                    addDomainEntity(CKCCRelation.class, relation);
                } else if (repository.findEntity(CKCCCollective.class, "urn", urn) == null) {
                    System.out.printf("%s: failed to find %s %s%n", letterId, role, urn);
                }
            }
        }

        /**
         * Relates the stored document to a location. Question marks are removed from
         * the raw value, which is then converted with the location concordance; a
         * conversion result of "?" is passed over silently. A location that is not
         * found in the repository is reported on standard output.
         *
         * @param letterId       id of the letter, used in messages
         * @param storedId       id of the stored document, the source of the relation
         * @param rawLocation    the raw field value
         * @param relationTypeId id of the relation type to use
         * @param role           "sender" or "recipient", used in messages
         */
        private void linkLocation(String letterId, String storedId, String rawLocation, String relationTypeId,
                String role) throws Exception {
            if (!valid(rawLocation)) {
                return;
            }
            String urn = concordance.convert(rawLocation.replace("?", ""));
            if (urn.equals("?")) {
                return;
            }
            Location location = repository.findEntity(Location.class, "^urn", urn);
            if (location != null) {
                // NOTE(review): Document resolves to the imported nl.knaw.huygens.tei.Document;
                // presumably only the simple class name matters to TypeNames - confirm.
                CKCCRelation relation = RelationBuilder.newInstance(CKCCRelation.class)
                        .withRelationTypeId(relationTypeId)
                        .withSourceType(TypeNames.getInternalName(Document.class)).withSourceId(storedId)
                        .withTargetType(TypeNames.getInternalName(Location.class))
                        .withTargetId(location.getId()).build();
                addDomainEntity(CKCCRelation.class, relation);
            } else {
                System.out.printf("%s: failed to find %s location %s (mapped to %s)%n", letterId, role,
                        rawLocation, urn);
            }
        }

        /**
         * Tells whether a field carries a value: it is neither empty nor a lone "?".
         */
        private boolean valid(String text) {
            return !text.isEmpty() && !text.equals("?");
        }
    }

}