com.tekstosense.segmenter.Rule.SectionRules.java Source code

Java tutorial

Introduction

Here is the source code for com.tekstosense.segmenter.Rule.SectionRules.java

Source

/*******************************************************************************
 * Copyright (c) 2016, TekstoSense and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package com.tekstosense.segmenter.Rule;

import com.google.common.base.Preconditions;
import com.tekstosense.segmenter.model.DoublyLinkedList;
import com.tekstosense.segmenter.model.LineNode;
import com.tekstosense.segmenter.model.Node;
import com.tekstosense.segmenter.model.Section;

import org.apache.commons.lang3.math.NumberUtils;

import java.util.ArrayList;
import java.util.List;

import static com.tekstosense.segmenter.Rule.HeadingRule.*;
import static com.tekstosense.segmenter.model.LinkedNGramUtil.buildNGram;

/**
 * Groups the lines of a document, supplied as a {@link DoublyLinkedList} of
 * {@link LineNode}s, into {@link Section}s.
 *
 * <p>{@link #applyDefaultRule()} runs three passes: fragments that sit on the
 * same visual line are merged, runs of lines that are out of vertical order
 * are merged, and finally every remaining line is classified as either a
 * section heading or section body text.
 *
 * <p>Instances keep mutable state ({@code sections}, {@code currentSection})
 * and mutate the text/top of the {@link LineNode}s they are given.
 *
 * @author TekstoSense
 */
public class SectionRules {

    /** Two lines whose {@code top} values differ by less than this are treated as one visual line. */
    private static final float LINE_SPACING = 5.0F;
    /** A line's descent must differ from the next line's descent by less than this to qualify as a heading. */
    private static final float DESCENT_CONSTANT = 1F;
    /** A heading whose text contains this word (case-insensitively) marks a reference section. */
    private static final String REFERENCE = "Reference";

    private final DoublyLinkedList<LineNode> lineNodeDoublyLinkedList;
    private List<Section> sections;
    private Section currentSection;

    /**
     * @param lineNodeDoublyLinkedList the lines to segment; checked for
     *                                 {@code null} in {@link #applyDefaultRule()}
     */
    public SectionRules(DoublyLinkedList<LineNode> lineNodeDoublyLinkedList) {
        this.lineNodeDoublyLinkedList = lineNodeDoublyLinkedList;
    }

    /**
     * @return the sections built so far; {@code null} until
     *         {@link #applyDefaultRule()} has processed at least one line
     */
    public List<Section> getSections() {
        return sections;
    }

    /**
     * Runs the full pipeline: same-line merge, n-gram merge, then section
     * detection. Results are available through {@link #getSections()}.
     *
     * @throws NullPointerException if the list passed to the constructor was {@code null}
     */
    public void applyDefaultRule() {
        Preconditions.checkNotNull(this.lineNodeDoublyLinkedList, "LinkedList can not be null");
        DoublyLinkedList<LineNode> sameLineMerged = mergerLines(this.lineNodeDoublyLinkedList);
        DoublyLinkedList<LineNode> orderMerged = nGramMerge(sameLineMerged);
        iterateLinkedList(orderMerged);
    }

    /**
     * Builds a new list in which runs of out-of-order lines are folded into a
     * single line.
     */
    private DoublyLinkedList<LineNode> nGramMerge(DoublyLinkedList<LineNode> lineNodeDoublyLinkedList) {
        DoublyLinkedList<LineNode> finalLinkedList = new DoublyLinkedList<>();
        nGramM(lineNodeDoublyLinkedList.getFirst(), finalLinkedList, 0);
        return finalLinkedList;
    }

    /**
     * Walks the list from {@code startNode}. A node whose 3-gram (as built by
     * {@code buildNGram}) is in correct order is emitted to
     * {@code finalLinkedList}; nodes whose 3-gram is not in order are counted
     * and later merged together with the next in-order node.
     *
     * <p>Implemented as a loop so that long documents cannot overflow the
     * call stack.
     *
     * @param startNode       first node to process, may be {@code null}
     * @param finalLinkedList receives the resulting lines
     * @param mergeItr        number of nodes immediately before
     *                        {@code startNode} that are already pending a merge
     */
    private void nGramM(Node<LineNode> startNode, DoublyLinkedList<LineNode> finalLinkedList, int mergeItr) {
        int pending = mergeItr;
        for (Node<LineNode> node = startNode; node != null; node = node.getNext()) {
            Node<LineNode>[] nGrams = buildNGram(node, 3);
            if (!isInCorrectOrder(nGrams)) {
                pending++;
                continue;
            }
            if (pending > 0) {
                Node<LineNode> merged = mergeNGram(pending, node);
                finalLinkedList.addToLast(merged.getData());
            } else {
                finalLinkedList.addToLast(node.getData());
            }
            pending = 0;
        }
        // NOTE(review): lines still pending when the list ends are not emitted.
        // Whether that can happen depends on the window buildNGram uses - confirm.
    }

    /**
     * Folds {@code currentNode} and the pending nodes before it into the node
     * {@code mergeItr} positions back, joining their texts with single spaces.
     *
     * @param mergeItr    how many steps to walk back from {@code currentNode};
     *                    the node reached becomes the merge target
     * @param currentNode the in-order node that ends the pending run
     * @return the merge target (its text now holds the joined text), or
     *         {@code null} if {@code currentNode} is {@code null}
     */
    private Node<LineNode> mergeNGram(int mergeItr, Node<LineNode> currentNode) {
        if (currentNode == null) {
            return null;
        }

        // Collect texts back-to-front so they can be appended in reading order.
        String[] trailingTexts = new String[Math.max(mergeItr, 0)];
        Node<LineNode> target = currentNode;
        for (int i = trailingTexts.length - 1; i >= 0; i--) {
            trailingTexts[i] = target.getData().getText();
            target = target.getPrev();
        }

        // Start from the target's own text (not the LineNode object itself).
        StringBuilder mergedText = new StringBuilder();
        mergedText.append(target.getData().getText());
        for (String text : trailingTexts) {
            mergedText.append(' ').append(text);
        }
        target.getData().setText(mergedText.toString());
        return target;
    }

    /**
     * @return {@code false} if any two adjacent, non-null entries are on the
     *         same page and the earlier one has a larger {@code top} than the
     *         later one; {@code true} otherwise
     */
    private boolean isInCorrectOrder(Node<LineNode>[] nGrams) {
        for (int i = 1; i < nGrams.length; i++) {
            Node<LineNode> earlier = nGrams[i - 1];
            Node<LineNode> later = nGrams[i];
            if (earlier == null || later == null) {
                continue;
            }
            boolean samePage = earlier.getData().getPage() == later.getData().getPage();
            if (samePage && earlier.getData().getTop() > later.getData().getTop()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Builds a new list in which fragments lying on the same visual line as
     * their predecessor are folded into one line.
     */
    private DoublyLinkedList<LineNode> mergerLines(DoublyLinkedList<LineNode> lineNodeDoublyLinkedList) {
        DoublyLinkedList<LineNode> finalLinkedList = new DoublyLinkedList<>();
        merge(lineNodeDoublyLinkedList.getFirst(), finalLinkedList, null);
        return finalLinkedList;
    }

    /**
     * Walks the list from {@code lineNodeNode}. A node that is part of the
     * previous line has its text appended (space separated) to the current
     * merge target; any other node is added to {@code finalLinkedList} and
     * becomes the new merge target.
     *
     * <p>Implemented as a loop so that long documents cannot overflow the
     * call stack.
     *
     * @param lineNodeNode    first node to process, may be {@code null}
     * @param finalLinkedList receives the merged lines
     * @param currentNode     initial merge target, may be {@code null}
     */
    private void merge(Node<LineNode> lineNodeNode, DoublyLinkedList<LineNode> finalLinkedList,
            Node<LineNode> currentNode) {
        Node<LineNode> mergeTarget = currentNode;
        for (Node<LineNode> node = lineNodeNode; node != null; node = node.getNext()) {
            if (isPartOfPreviousLine(node) && mergeTarget != null) {
                LineNode target = mergeTarget.getData();
                target.setText(target.getText() + " " + ((LineNode) node.getData()).getText());
            } else {
                // Without a merge target the line is emitted rather than dropped.
                finalLinkedList.addToLast(node.getData());
                mergeTarget = finalLinkedList.getLast();
            }
        }
    }

    /** Classifies every line of the given list, starting at its first node. */
    private void iterateLinkedList(DoublyLinkedList<LineNode> lineNodeDoublyLinkedList) {
        parseNode(lineNodeDoublyLinkedList.getFirst());
    }

    /**
     * Processes {@code lineNodeNode} and every node after it: a node accepted
     * by both {@link #isHeader} and {@link #isReallyAHeader} goes to
     * {@link #createSection}; every other node goes to
     * {@link #addToCurrentSection}. Each node is handled exactly once.
     *
     * @param lineNodeNode first node to process; {@code null} is a no-op
     */
    private void parseNode(Node<LineNode> lineNodeNode) {
        for (Node<LineNode> node = lineNodeNode; node != null; node = node.getNext()) {
            if (isHeader(node) && isReallyAHeader(node)) {
                createSection(node);
            } else {
                addToCurrentSection(node);
            }
        }
    }

    /**
     * Second-stage filter for heading candidates. This method has no side
     * effects; the caller acts on the result.
     *
     * @return {@code false} if the text is rejected by any of the
     *         {@code HeadingRule} text checks or by
     *         {@link #hasSpecialCharacters}, or if the node continues the
     *         previous line while the current section already has body lines
     *         (it is then body text); {@code true} otherwise, which includes
     *         a continuation of a heading that has no body lines yet
     */
    private boolean isReallyAHeader(Node<LineNode> lineNodeNode) {
        String text = lineNodeNode.getData().getText();
        if (isSingleChar(text) || hasSpecialCharacters(lineNodeNode)
                || hasMoreNumbers(text)
                || hasMoreSpecialChars(text)
                || hasOnlyNumber(text)
                || endsWithSpecialChar(text)
                || !hasBracketPair(text)
                || !hasSingleClosingBracket(text)
                || hasOnlySpecialChar(text)) {
            return false;
        }

        boolean sectionHasBody = currentSection != null
                && currentSection.getLines() != null
                && currentSection.getLines().getSize() > 0;
        if (sectionHasBody && isPartOfPreviousLine(lineNodeNode)) {
            return false;
        }
        return true;
    }

    /**
     * @return {@code true} if the line's font family is {@code symbol},
     *         {@code CMMI9} or {@code CMMI6} (case-insensitive), or its text
     *         is a number according to {@code NumberUtils.isNumber}
     */
    private boolean hasSpecialCharacters(Node<LineNode> lineNodeNode) {
        String family = lineNodeNode.getData().getFamily();
        return "symbol".equalsIgnoreCase(family)
                || "CMMI9".equalsIgnoreCase(family)
                || "CMMI6".equalsIgnoreCase(family)
                || NumberUtils.isNumber(lineNodeNode.getData().getText());
    }

    /**
     * Adds the line to the current section, creating the section list and an
     * untitled section first if none exists. A line that continues the
     * previous one is concatenated (no separator) onto the section's last
     * line, whose {@code top} is updated; otherwise it becomes a new line.
     */
    private void addToCurrentSection(Node<LineNode> lineNodeNode) {
        if (sections == null) {
            sections = new ArrayList<>();
        }
        if (currentSection == null) {
            currentSection = new Section();
            sections.add(currentSection);
        }

        if (isPartOfPreviousLine(lineNodeNode)
                && currentSection.getLines() != null
                && currentSection.getLines().getLast() != null) {
            LineNode lastLine = (LineNode) currentSection.getLines().getLast().getData();
            lastLine.setText(lastLine.getText() + lineNodeNode.getData().getText());
            lastLine.setTop(lineNodeNode.getData().getTop());
        } else {
            currentSection.addLine(lineNodeNode.getData());
        }
    }

    /**
     * @return {@code true} if the node has a predecessor whose {@code top}
     *         differs from this node's by less than {@link #LINE_SPACING}
     */
    private boolean isPartOfPreviousLine(Node<LineNode> lineNodeNode) {
        if (lineNodeNode.getPrev() == null) {
            return false;
        }
        float previousTop = ((LineNode) lineNodeNode.getPrev().getData()).getTop();
        return Math.abs(lineNodeNode.getData().getTop() - previousTop) < LINE_SPACING;
    }

    /**
     * First-stage heading test based on font size, font family and descent.
     * A missing neighbour counts as size {@code 0} / family {@code ""}.
     *
     * <p>On page 1 the line must be at least as large as the previous line,
     * larger than the next, and pass {@link #hasCorrectDescent}. On other
     * pages it must either be strictly larger than both neighbours, or use a
     * font family different from both neighbours and not be a plain number;
     * in both cases it must also pass {@link #hasCorrectDescent}.
     */
    private boolean isHeader(Node<LineNode> lineNodeNode) {
        float currentSize = lineNodeNode.getData().getSize();
        float prevSize = 0;
        float nextSize = 0;
        if (lineNodeNode.getPrev() != null) {
            prevSize = ((LineNode) lineNodeNode.getPrev().getData()).getSize();
        }
        if (lineNodeNode.getNext() != null) {
            nextSize = ((LineNode) lineNodeNode.getNext().getData()).getSize();
        }

        if (isFirstPage(lineNodeNode)) {
            return prevSize <= currentSize && currentSize > nextSize && hasCorrectDescent(lineNodeNode);
        }

        if (prevSize < currentSize && currentSize > nextSize && hasCorrectDescent(lineNodeNode)) {
            return true;
        }

        String currentFamily = lineNodeNode.getData().getFamily();
        String prevFamily = "";
        String nextFamily = "";
        if (lineNodeNode.getPrev() != null) {
            prevFamily = ((LineNode) lineNodeNode.getPrev().getData()).getFamily();
        }
        if (lineNodeNode.getNext() != null) {
            nextFamily = ((LineNode) lineNodeNode.getNext().getData()).getFamily();
        }
        return !currentFamily.equalsIgnoreCase(prevFamily)
                && !currentFamily.equalsIgnoreCase(nextFamily)
                && !NumberUtils.isNumber(lineNodeNode.getData().getText())
                && hasCorrectDescent(lineNodeNode);
    }

    /**
     * @return {@code true} if this line's descent differs from the next
     *         line's descent ({@code 0} when there is no next line) by less
     *         than {@link #DESCENT_CONSTANT}
     */
    private boolean hasCorrectDescent(Node<LineNode> lineNodeNode) {
        float currentDescent = lineNodeNode.getData().getDescent();
        float nextDescent = 0;
        if (lineNodeNode.getNext() != null) {
            nextDescent = ((LineNode) lineNodeNode.getNext().getData()).getDescent();
        }
        return Math.abs(currentDescent - nextDescent) < DESCENT_CONSTANT;
    }

    /** @return {@code true} if the line's page number is {@code 1} */
    private boolean isFirstPage(Node<LineNode> lineNodeNode) {
        return lineNodeNode.getData().getPage() == 1;
    }

    /**
     * Handles a confirmed heading line.
     *
     * <p>A new section is started when the line does not continue the
     * previous line and {@code isCurrentORAfterReference} reports
     * {@code false}; a heading containing {@link #REFERENCE} flags that
     * section as a reference section. Otherwise the line extends the current
     * heading: its {@code top} is always updated, and its text is appended
     * unless {@code isCurrentORAfterReference} reports {@code true}.
     *
     * <p>If there is no current section or heading to extend, a new section
     * is started instead of failing.
     */
    private void createSection(Node<LineNode> lineNodeNode) {
        if (sections == null) {
            sections = new ArrayList<>();
        }

        boolean afterReference = isCurrentORAfterReference(sections, currentSection);
        boolean startsNewSection = !isPartOfPreviousLine(lineNodeNode) && !afterReference;
        boolean hasHeadingToExtend = currentSection != null && currentSection.getHeading() != null;

        if (startsNewSection || !hasHeadingToExtend) {
            Section section = new Section();
            section.setHeading(lineNodeNode.getData());
            if (lineNodeNode.getData().getText().toLowerCase().contains(REFERENCE.toLowerCase())) {
                section.setReference(true);
            }
            currentSection = section;
            sections.add(currentSection);
            return;
        }

        LineNode heading = (LineNode) currentSection.getHeading();
        String updatedText = heading.getText() + lineNodeNode.getData().getText();
        if (!afterReference) {
            heading.setText(updatedText);
        }
        heading.setTop(lineNodeNode.getData().getTop());
    }
}