nl.knaw.huygens.alexandria.lmnl.importer.LMNLImporter.java Source code

Java tutorial

Introduction

Here is the source code for nl.knaw.huygens.alexandria.lmnl.importer.LMNLImporter.java

Source

package nl.knaw.huygens.alexandria.lmnl.importer;

/*
 * #%L
 * alexandria-markup
 * =======
 * Copyright (C) 2016 - 2018 HuC DI (KNAW)
 * =======
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import nl.knaw.huygens.alexandria.ErrorListener;
import nl.knaw.huygens.alexandria.lmnl.grammar.LMNLLexer;
import nl.knaw.huygens.alexandria.storage.TAGAnnotation;
import nl.knaw.huygens.alexandria.storage.TAGDocument;
import nl.knaw.huygens.alexandria.storage.TAGMarkup;
import nl.knaw.huygens.alexandria.storage.TAGStore;
import nl.knaw.huygens.alexandria.storage.dto.*;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.Token;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.util.*;
import java.util.stream.Collectors;

public class LMNLImporter {
    private static final Logger LOG = LoggerFactory.getLogger(LMNLImporter.class);

    private static TAGStore tagStore;

    /**
     * Creates an importer that uses the given store to create documents and persist DTOs.
     * <p>
     * NOTE(review): the store is assigned to a <em>static</em> field, so constructing another importer
     * replaces the store used by every existing instance and by the static helpers in this class.
     * Confirm that one store per JVM is intended.
     *
     * @param tagStore the store assigned to the class-wide {@code tagStore} field
     */
    public LMNLImporter(TAGStore tagStore) {
        LMNLImporter.tagStore = tagStore;
    }

    /**
     * Mutable parse state for one document while it is being imported.
     * <p>
     * Keeps track of the markup that is currently open, the markup whose tag is being processed, and the
     * annotations being read. Problems are recorded in the error list of the owning {@link ImporterContext}.
     */
    static class DocumentContext {
        private final TAGDocumentDTO document;
        // Every markup that has been opened and not yet closed; push() places the most recently opened first.
        private final Deque<TAGMarkupDTO> openMarkupDeque = new ArrayDeque<>();
        // Markup whose tag is being processed: pushed by openMarkup/pushOpenMarkup,
        // popped by popOpenMarkup/closeMarkup.
        private final Stack<TAGMarkupDTO> openMarkupStack = new Stack<>();
        private final Stack<TAGAnnotationDTO> annotationStack = new Stack<>();
        private final ImporterContext importerContext;

        /**
         * @param document        the document DTO this context adds markup and text nodes to
         * @param importerContext the owning context; receives error messages
         */
        DocumentContext(TAGDocumentDTO document, ImporterContext importerContext) {
            this.document = document;
            this.importerContext = importerContext;
        }

        /**
         * Registers a newly opened markup: it becomes both an open markup and the markup being processed,
         * and its id is added to the document's markup ids.
         *
         * @param markup the markup that was just opened
         */
        void openMarkup(TAGMarkupDTO markup) {
            openMarkupDeque.push(markup);
            openMarkupStack.push(markup);
            document.getMarkupIds().add(markup.getDbId());
        }

        /**
         * Makes the most recently opened markup whose extended tag equals {@code rangeName} the markup being
         * processed. If no open markup matches, an error is recorded and nothing is pushed.
         *
         * @param rangeName the tag name read from a close tag
         */
        void pushOpenMarkup(String rangeName) {
            // The deque streams head-first, i.e. most recently opened markup first.
            Optional<TAGMarkup> match = openMarkupDeque.stream()
                    .map(dto -> new TAGMarkup(tagStore, dto))
                    .filter(m -> m.getExtendedTag().equals(rangeName))
                    .findFirst();
            if (!match.isPresent()) {
                importerContext.errors.add("Closing tag {" + rangeName + "] found without corresponding open tag.");
                return;
            }

            TAGMarkup markup = match.get();
            if (!document.markupHasTextNodes(markup)) {
                // every markup should have at least one textNode
                TAGTextNodeDTO emptyTextNode = new TAGTextNodeDTO("");
                update(emptyTextNode);
                addTextNode(emptyTextNode);
                // NOTE(review): this pops whatever is on openMarkupStack (if anything) before the matched
                // markup is pushed below; kept as in the original, intent to be confirmed.
                closeMarkup();
            }
            openMarkupStack.push(markup.getDTO());
        }

        /**
         * Removes the markup being processed from {@code openMarkupStack}; the markup itself stays open.
         *
         * @throws EmptyStackException if no markup is being processed
         */
        void popOpenMarkup() {
            openMarkupStack.pop();
        }

        /**
         * Closes the markup being processed, if there is one: persists it and removes it from the open markup.
         */
        void closeMarkup() {
            if (openMarkupStack.isEmpty()) {
                return;
            }
            TAGMarkupDTO markup = openMarkupStack.pop();
            update(markup);
            openMarkupDeque.remove(markup);
        }

        /**
         * Associates the text node with every open markup (oldest opened first, in layer {@code ""}) and
         * appends it to the document, marking it as the first text node if the document had none.
         *
         * @param textNode the text node to add
         */
        void addTextNode(TAGTextNodeDTO textNode) {
            Iterator<TAGMarkupDTO> oldestFirst = openMarkupDeque.descendingIterator();
            while (oldestFirst.hasNext()) {
                document.associateTextWithMarkupForLayer(textNode, oldestFirst.next(), "");
            }
            if (!document.hasTextNodes()) {
                document.setFirstTextNodeId(textNode.getDbId());
            }
            document.addTextNode(textNode);
        }

        /**
         * @return the markup being processed, or {@code null} when there is none
         */
        private TAGMarkupDTO currentMarkup() {
            // Guard on the stack that is actually peeked: the deque can hold open markup while the stack is
            // empty, and Stack.peek() throws EmptyStackException on an empty stack.
            return openMarkupStack.isEmpty() ? null : openMarkupStack.peek();
        }

        /**
         * Starts processing an annotation by pushing it on the annotation stack.
         * Attaching the annotation to its owner (markup or enclosing annotation) is currently disabled.
         *
         * @param annotation the annotation that was just opened
         */
        void openAnnotation(TAGAnnotationDTO annotation) {
            if (annotationStack.isEmpty()) {
                TAGMarkupDTO markup = currentMarkup();
                if (markup != null) {
                    //          markup.addAnnotation(annotation);
                }
            } else {
                //        annotationStack.peek().addAnnotation(annotation);
            }
            annotationStack.push(annotation);
        }

        /**
         * @return always {@code null}; looking up the annotation's document is currently disabled
         */
        TAGDocumentDTO currentAnnotationDocument() {
            return null;
            //      Long value = annotationStack.peek().getDocumentId();
            //      return tagStore.getDocumentDTO(value);
        }

        /**
         * Finishes the annotation on top of the annotation stack and persists it.
         *
         * @throws EmptyStackException if no annotation is open
         */
        void closeAnnotation() {
            TAGAnnotationDTO annotation = annotationStack.pop();
            update(annotation);
        }
    }

    /**
     * State shared by one import run: the lexer that supplies tokens, a stack of document contexts
     * (the top one receives all markup/text/annotation operations) and the list of collected errors.
     */
    static class ImporterContext {
        private final Deque<DocumentContext> documentContextStack = new ArrayDeque<>();
        private final LMNLLexer lexer;
        private final List<String> errors = new ArrayList<>();

        /**
         * @param lexer the lexer tokens are read from
         */
        ImporterContext(LMNLLexer lexer) {
            this.lexer = lexer;
        }

        /**
         * @return the next token produced by the lexer
         */
        Token nextToken() {
            return lexer.nextToken();
        }

        /**
         * @return the name of the lexer's current mode
         */
        String getModeName() {
            return lexer.getModeNames()[lexer._mode];
        }

        /**
         * Looks up the rule name for the lexer's current token by indexing the rule-name table with
         * {@code type - 1}.
         *
         * @return the rule name, or {@code "EOF"} when the current token is the end-of-file token
         */
        String getRuleName() {
            int tokenType = lexer.getToken().getType();
            if (tokenType == Token.EOF) {
                // Token.EOF is -1, so the table lookup below would use index -2 and throw
                // ArrayIndexOutOfBoundsException; callers use the name for diagnostics only.
                return "EOF";
            }
            return lexer.getRuleNames()[tokenType - 1];
        }

        /**
         * Makes a new context for {@code document} the current document context.
         *
         * @param document the document subsequent operations apply to
         */
        void pushDocumentContext(TAGDocumentDTO document) {
            documentContextStack.push(new DocumentContext(document, this));
        }

        /**
         * @return the current document context, or {@code null} if none has been pushed
         */
        DocumentContext currentDocumentContext() {
            return documentContextStack.peek();
        }

        /**
         * Removes the current document context, persists its document and records an error listing every
         * markup that is still open.
         *
         * @return the removed document context
         */
        DocumentContext popDocumentContext() {
            DocumentContext documentContext = documentContextStack.pop();
            update(documentContext.document);
            if (!documentContext.openMarkupDeque.isEmpty()) {
                String openRanges = documentContext.openMarkupDeque.stream()
                        .map(dto -> new TAGMarkup(tagStore, dto))
                        .map(m -> "[" + m.getExtendedTag() + "}")
                        .collect(Collectors.joining(", "));
                errors.add("Unclosed LMNL range(s): " + openRanges);
            }
            return documentContext;
        }

        /**
         * Creates and persists a markup DTO with the given tag, linked to the current document's id.
         *
         * @param tagName the tag of the new markup
         * @return the persisted markup DTO
         */
        TAGMarkupDTO newMarkup(String tagName) {
            TAGMarkupDTO tagMarkupDTO = new TAGMarkupDTO(currentDocumentContext().document.getDbId(), tagName);
            update(tagMarkupDTO);
            return tagMarkupDTO;
        }

        // The operations below delegate to the current document context.

        void openMarkup(TAGMarkupDTO markup) {
            currentDocumentContext().openMarkup(markup);
        }

        void pushOpenMarkup(String rangeName) {
            currentDocumentContext().pushOpenMarkup(rangeName);
        }

        void popOpenMarkup() {
            currentDocumentContext().popOpenMarkup();
        }

        void closeMarkup() {
            currentDocumentContext().closeMarkup();
        }

        void addTextNode(TAGTextNodeDTO textNode) {
            currentDocumentContext().addTextNode(textNode);
        }

        void openAnnotation(TAGAnnotationDTO annotation) {
            currentDocumentContext().openAnnotation(annotation);
        }

        TAGDocumentDTO currentAnnotationDocument() {
            return currentDocumentContext().currentAnnotationDocument();
        }

        void closeAnnotation() {
            currentDocumentContext().closeAnnotation();
        }

        /**
         * @return the live, mutable list of errors collected so far
         */
        List<String> getErrors() {
            return errors;
        }

        /**
         * @return {@code true} if at least one error has been collected
         */
        boolean hasErrors() {
            return !errors.isEmpty();
        }
    }

    /**
     * Imports an LMNL document given as a string.
     *
     * @param input the LMNL text
     * @return the document built from the input
     * @throws LMNLSyntaxError if parsing or tokenizing errors were found
     */
    public TAGDocument importLMNL(String input) throws LMNLSyntaxError {
        final CharStream lmnlChars = CharStreams.fromString(input);
        final TAGDocument imported = importLMNL(lmnlChars);
        return imported;
    }

    /**
     * Imports an LMNL document read from a stream.
     * <p>
     * NOTE(review): ANTLR documents {@code CharStreams.fromStream(InputStream)} as decoding UTF-8 and
     * closing the stream after reading it — confirm against the ANTLR version in use.
     *
     * @param input the stream supplying the LMNL text
     * @return the document built from the input
     * @throws LMNLSyntaxError      if parsing or tokenizing errors were found
     * @throws UncheckedIOException if reading the stream fails; the {@link IOException} is its cause
     */
    public TAGDocument importLMNL(InputStream input) throws LMNLSyntaxError {
        try {
            CharStream antlrInputStream = CharStreams.fromStream(input);
            return importLMNL(antlrInputStream);
        } catch (IOException e) {
            // The rethrown exception carries the cause, so the failure is reported once by whoever
            // handles it instead of also being dumped to stderr here.
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Tokenizes the character stream and builds a document from the tokens.
     * <p>
     * The document is created and persisted in the store before parsing starts, so it already exists
     * there when an error is reported.
     *
     * @param antlrInputStream the LMNL text to import
     * @return the imported document
     * @throws LMNLSyntaxError if the importer collected parsing errors or the lexer's error listener
     *                         collected tokenizing errors; the message lists both groups
     */
    private TAGDocument importLMNL(CharStream antlrInputStream) throws LMNLSyntaxError {
        LMNLLexer lexer = new LMNLLexer(antlrInputStream);
        ErrorListener errorListener = new ErrorListener();
        lexer.addErrorListener(errorListener);

        ImporterContext context = new ImporterContext(lexer);
        TAGDocument document = tagStore.createDocument();
        TAGDocumentDTO dto = document.getDTO();
        update(dto);
        context.pushDocumentContext(dto);
        handleDefaultMode(context);
        joinDiscontinuedRanges(document);
        context.popDocumentContext();

        // Build each error section separately and join them with a blank line, so the message never
        // starts with stray newlines when only tokenizing errors are present.
        List<String> errorSections = new ArrayList<>();
        if (context.hasErrors()) {
            String errors = String.join("\n", context.getErrors());
            errorSections.add("Parsing errors:\n" + errors);
        }
        if (errorListener.hasErrors()) {
            String errors = String.join("\n", errorListener.getErrors());
            errorSections.add("Tokenizing errors:\n" + errors);
        }
        if (!errorSections.isEmpty()) {
            throw new LMNLSyntaxError(String.join("\n\n", errorSections));
        }
        update(dto);
        return document;
    }

    /**
     * Top-level token loop: reads tokens until end of input and dispatches on their type.
     * A range opener or closer is handed to its handler, text becomes a persisted text node in the
     * current document, and any other token is reported as unexpected.
     *
     * @param context the import state supplying tokens and receiving results
     */
    private void handleDefaultMode(ImporterContext context) {
        final String methodName = "defaultMode";
        for (Token current = context.nextToken(); //
                current.getType() != Token.EOF; //
                current = context.nextToken()) {
            final String ruleName = context.getRuleName();
            final String modeName = context.getModeName();
            log(methodName, ruleName, modeName, current, context);

            final int type = current.getType();
            if (type == LMNLLexer.BEGIN_OPEN_RANGE) {
                handleOpenRange(context);

            } else if (type == LMNLLexer.BEGIN_CLOSE_RANGE) {
                handleCloseRange(context);

            } else if (type == LMNLLexer.TEXT) {
                final TAGTextNodeDTO node = new TAGTextNodeDTO(current.getText());
                update(node);
                context.addTextNode(node);

            } else {
                handleUnexpectedToken(methodName, current, ruleName, modeName);
            }
        }
    }

    /**
     * Processes the tokens of a range open tag, up to and including the token that ends it.
     * <p>
     * The tag name opens a new markup; an annotation opener is passed to {@link #handleAnnotation};
     * the regular end token finishes the tag and leaves the markup open; the anonymous-range end token
     * gives the markup an empty text node and closes it immediately.
     * If the input ends before the tag is complete, an error is recorded in the context and the method
     * returns.
     *
     * @param context the import state supplying tokens and receiving results
     */
    private void handleOpenRange(ImporterContext context) {
        String methodName = "handleOpenRange";
        boolean goOn = true;
        while (goOn) {
            Token token = context.nextToken();
            if (token.getType() == Token.EOF) {
                // Handle EOF up front: it has no entry in the lexer's rule-name table and the tag can
                // never be completed.
                context.getErrors().add("Unexpected end of input inside a range open tag.");
                return;
            }
            String ruleName = context.getRuleName();
            String modeName = context.getModeName();
            log(methodName, ruleName, modeName, token, context);
            switch (token.getType()) {
            case LMNLLexer.Name_Open_Range:
                TAGMarkupDTO markup = context.newMarkup(token.getText());
                context.openMarkup(markup);
                break;
            case LMNLLexer.BEGIN_OPEN_ANNO:
                handleAnnotation(context);
                break;
            case LMNLLexer.END_OPEN_RANGE:
                // The tag is done; the markup stays open until its close tag.
                context.popOpenMarkup();
                goOn = false;
                break;
            case LMNLLexer.END_ANONYMOUS_RANGE:
                // The range is closed right away, so give it an empty text node first.
                TAGTextNodeDTO textNode = new TAGTextNodeDTO("");
                update(textNode);
                context.addTextNode(textNode);
                context.closeMarkup();
                goOn = false;
                break;

            default:
                handleUnexpectedToken(methodName, token, ruleName, modeName);
                break;
            }
        }
    }

    /**
     * Placeholder for annotation handling: the implementation below is commented out, so this method
     * currently consumes no tokens and changes no state. Any annotation tokens that follow are therefore
     * seen by the caller's own token loop.
     *
     * @param context the import state (unused while the implementation is disabled)
     */
    private void handleAnnotation(ImporterContext context) {
        // Only referenced by the disabled implementation below.
        String methodName = "handleAnnotation";
        // Disabled implementation, kept for reference:
        //    TAGAnnotationDTO annotation = tagStore.createAnnotationDTO("");
        //    context.openAnnotation(annotation);
        //    boolean goOn = true;
        //    while (goOn) {
        //      Token token = context.nextToken();
        //      String ruleName = context.getRuleName();
        //      String modeName = context.getModeName();
        //      log(methodName, ruleName, modeName, token, context);
        //      switch (token.getType()) {
        //        case LMNLLexer.Name_Open_Annotation:
        //          annotation.setKey(token.getText());
        //          break;
        //        case LMNLLexer.OPEN_ANNO_IN_ANNO_OPENER:
        //        case LMNLLexer.OPEN_ANNO_IN_ANNO_CLOSER:
        //          handleAnnotation(context);
        //          break;
        //        case LMNLLexer.END_OPEN_ANNO:
        //          context.pushDocumentContext(context.currentAnnotationDocument());
        //          break;
        //
        //        case LMNLLexer.ANNO_TEXT:
        //          TAGTextNodeDTO textNode = new TAGTextNodeDTO(token.getText());
        //          update(textNode);
        //          context.addTextNode(textNode);
        //          break;
        //
        //        case LMNLLexer.BEGIN_ANNO_OPEN_RANGE:
        //          handleOpenRange(context);
        //          break;
        //
        //        case LMNLLexer.BEGIN_ANNO_CLOSE_RANGE:
        //          handleCloseRange(context);
        //          break;
        //
        //        case LMNLLexer.BEGIN_CLOSE_ANNO:
        //        case LMNLLexer.Name_Close_Annotation:
        //          break;
        //        case LMNLLexer.END_CLOSE_ANNO:
        //          context.popDocumentContext();
        //        case LMNLLexer.END_EMPTY_ANNO:
        //          context.closeAnnotation();
        //          goOn = false;
        //          break;
        //
        //        // case LMNLLexer.TagOpenStartChar:
        //        // case LMNLLexer.TagOpenEndChar:
        //        // case LMNLLexer.TagCloseStartChar:
        //        // case LMNLLexer.TagCloseEndChar:
        //        // break;
        //
        //        default:
        //          handleUnexpectedToken(methodName, token, ruleName, modeName);
        //          break;
        //      }
        //      goOn = goOn && token.getType() != Token.EOF;
        //    }
    }

    /**
     * Processes the tokens of a range close tag, up to and including the token that ends it.
     * <p>
     * The tag name selects the matching open markup; an annotation opener is passed to
     * {@link #handleAnnotation}; the end token closes the selected markup.
     * If the input ends before the tag is complete, an error is recorded in the context and the method
     * returns.
     *
     * @param context the import state supplying tokens and receiving results
     */
    private void handleCloseRange(ImporterContext context) {
        String methodName = "handleCloseRange";
        boolean goOn = true;
        while (goOn) {
            Token token = context.nextToken();
            if (token.getType() == Token.EOF) {
                // Handle EOF up front: it has no entry in the lexer's rule-name table and the tag can
                // never be completed.
                context.getErrors().add("Unexpected end of input inside a range close tag.");
                return;
            }
            String ruleName = context.getRuleName();
            String modeName = context.getModeName();
            log(methodName, ruleName, modeName, token, context);
            switch (token.getType()) {
            case LMNLLexer.Name_Close_Range:
                String rangeName = token.getText();
                context.pushOpenMarkup(rangeName);
                break;
            case LMNLLexer.BEGIN_OPEN_ANNO_IN_RANGE_CLOSER:
                handleAnnotation(context);
                break;
            case LMNLLexer.END_CLOSE_RANGE:
                context.closeMarkup();
                goOn = false;
                break;

            default:
                handleUnexpectedToken(methodName, token, ruleName, modeName);
                break;
            }
        }
    }

    /**
     * Reports a token the calling handler cannot process: logs a description at error level and
     * aborts the import by throwing.
     *
     * @param methodName name of the handler that received the token
     * @param token      the offending token
     * @param ruleName   lexer rule name reported for the token
     * @param modeName   lexer mode name at the time the token was read
     * @throws LMNLSyntaxError always
     */
    private void handleUnexpectedToken(String methodName, Token token, String ruleName, String modeName) {
        final StringBuilder description = new StringBuilder();
        description.append(methodName).append(": unexpected rule/token: token=").append(token);
        description.append(", ruleName=").append(ruleName);
        description.append(", mode=").append(modeName);

        final String text = description.toString();
        LOG.error(text);
        throw new LMNLSyntaxError(text);
    }

    //  private static void joinDiscontinuedRanges(Document document) {
    //    joinDiscontinuedRanges(document.getDocumentId());
    //  }

    /**
     * Placeholder: the implementation below is commented out, so this method currently does nothing.
     * Judging by the disabled code, it was meant to merge markup sharing a tag and an "n" annotation
     * value — NOTE(review): confirm before re-enabling.
     *
     * @param document the imported document (unused while the implementation is disabled)
     */
    private static void joinDiscontinuedRanges(TAGDocument document) {
        // Disabled implementation, kept for reference:
        //    Map<String, TAGMarkupDTO> markupsToJoin = new HashMap<>();
        //    List<Long> markupIdsToRemove = new ArrayList<>();
        //    document.getMarkupStream()//
        //        .filter(TAGMarkup::hasN)//
        //        .forEach(markup -> {
        //          String tag = markup.getTag();
        //          AnnotationInfo annotation = markup.getAnnotationStream()//
        //              .filter(a -> a.getName().equals("n"))//
        //              .findFirst()//
        //              .get();
        //          String key = tag + "-" + annotationText(annotation);
        //          if (markupsToJoin.containsKey(key)) {
        //            TAGMarkupDTO originalMarkup = markupsToJoin.get(key);
        //            markup.getDTO().getAnnotationIds().remove(annotation.getDbId());
        ////            document.joinMarkup(originalMarkup, markup);
        //            markupIdsToRemove.add(markup.getDbId());
        //          } else {
        //            markupsToJoin.put(key, markup.getDTO());
        //          }
        //        });
        //
        //    document.getDTO().getMarkupIds().removeAll(markupIdsToRemove);
        ////    document.getMarkupStream()//
        ////        .map(TAGMarkup::getAnnotationStream)//
        ////        .flatMap(Function.identity())//
        ////        .map(TAGAnnotation::getDocument)//
        ////        .forEach(LMNLImporter::joinDiscontinuedRanges);
    }

    /**
     * Placeholder: always returns the literal {@code "TODO"}; the annotation is not inspected.
     * The commented-out code below shows the intended lookup of the annotation's text.
     *
     * @param annotation the annotation whose text is wanted (currently unused)
     * @return the string {@code "TODO"}
     */
    private static String annotationText(TAGAnnotation annotation) {
        return "TODO";
        //    return annotation.getDocument().getTextNodeStream()//
        //        .map(TAGTextNode::getText)//
        //        .collect(joining());
    }

    /**
     * Trace hook called by the token handlers for every token they read. The logging statement is
     * commented out, so this method currently does nothing.
     *
     * @param mode      name of the calling handler
     * @param ruleName  lexer rule name reported for the token
     * @param modeName  lexer mode name at the time the token was read
     * @param token     the token that was read
     * @param context   the import state
     */
    private void log(String mode, String ruleName, String modeName, Token token, ImporterContext context) {
        // LOG.info("{}:\tlevel:{}, <{}> :\t{} ->\t{}", //
        // mode, context.limenContextStack.size(), //
        // token.getText().replace("\n", "\\n"), //
        // ruleName, modeName);
    }

    /**
     * Persists the given DTO in the class-wide store.
     *
     * @param tagdto the DTO to persist
     * @return the value returned by the store's {@code persist} call
     */
    private static Long update(TAGDTO tagdto) {
        final Long persistResult = tagStore.persist(tagdto);
        return persistResult;
    }

}