org.trnltk.apps.tokenizer.TextTokenizerDefaultTrainingApp.java Source code

Introduction

Here is the source code for org.trnltk.apps.tokenizer.TextTokenizerDefaultTrainingApp.java
Source

/*
 * Copyright  2013  Ali Ok (aliokATapacheDOTorg)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.trnltk.apps.tokenizer;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import org.junit.runner.RunWith;
import org.trnltk.apps.commons.App;
import org.trnltk.apps.commons.AppRunner;
import org.trnltk.tokenizer.*;

import java.util.Map;

@RunWith(AppRunner.class)
public class TextTokenizerDefaultTrainingApp extends TextTokenizerDefaultTrainingTest {

    @App
    public void dumpBigTokenizationGraphInDotFormat() {
        final TextTokenizer tokenizer = TextTokenizer.createDefaultTextTokenizer(true);
        dumpTokenizationGraph(tokenizer.getGraph(), Predicates.<TokenizationGraphNode>alwaysTrue(),
                Predicates.<TokenizationGraphNode>alwaysTrue(), Predicates.<TokenizationGraphEdge>alwaysTrue());
    }

    @App
    public void dumpSomePortionOfBigTokenizationGraphInDotFormat() {
        final TextTokenizer tokenizer = TextTokenizer.createDefaultTextTokenizer(true);
        final Predicate<TokenizationGraphNode> Type_Word_Matcher = new Predicate<TokenizationGraphNode>() {
            @Override
            public boolean apply(TokenizationGraphNode input) {
                final ImmutableList<TextBlockType> textBlockTypes = input.getData().getTextBlockTypes();
                if (textBlockTypes.iterator().next().equals(TextBlockType.Word)) {
                    return true;
                } else {
                    return false;
                }
            }
        };
        dumpTokenizationGraph(tokenizer.getGraph(), Type_Word_Matcher, Type_Word_Matcher,
                Predicates.<TokenizationGraphEdge>alwaysTrue());
    }

    @App
    public void dumpSmallTokenizationGraphInDotFormat() {
        final TextTokenizerTrainer localTokenizer = new TextTokenizerTrainer(2, true);
        localTokenizer.train("ali veli.", "ali veli .");

        dumpTokenizationGraph(localTokenizer.build(), Predicates.<TokenizationGraphNode>alwaysTrue(),
                Predicates.<TokenizationGraphNode>alwaysTrue(), Predicates.<TokenizationGraphEdge>alwaysTrue());
    }

    @App
    public void dumpSmallTokenizationGraphInDotFormatWithoutInference() {
        final TextTokenizerTrainer localTokenizer = new TextTokenizerTrainer(2, true);
        localTokenizer.train("ali geldi.", "ali geldi .");

        Predicate<TokenizationGraphEdge> directEdgePredicate = new Predicate<TokenizationGraphEdge>() {
            @Override
            public boolean apply(TokenizationGraphEdge input) {
                return !input.isInferred();
            }
        };
        dumpTokenizationGraph(localTokenizer.build(), Predicates.<TokenizationGraphNode>alwaysTrue(),
                Predicates.<TokenizationGraphNode>alwaysTrue(), directEdgePredicate);
    }

    @App
    public void dumpSmallTokenizationGraph_onlyForWordsAndDot_InDotFormat() {
        final TextTokenizerTrainer localTokenizer = new TextTokenizerTrainer(2, true);
        localTokenizer.train("ali veli.", "ali veli .");

        dumpTokenizationGraph(localTokenizer.build(), new Predicate<TokenizationGraphNode>() {
            @Override
            public boolean apply(TokenizationGraphNode input) {
                final ImmutableList<TextBlockType> types = input.getData().getTextBlockTypes();
                if (types.contains(TextBlockType.Abbreviation) || types.get(1).equals(TextBlockType.Sentence_Start))
                    return false;
                return true;
            }
        },

                new Predicate<TokenizationGraphNode>() {
                    @Override
                    public boolean apply(TokenizationGraphNode input) {
                        final ImmutableList<TextBlockType> types = input.getData().getTextBlockTypes();
                        if (types.contains(TextBlockType.Abbreviation)
                                || types.get(0).equals(TextBlockType.Sentence_End)
                                || types.get(1).equals(TextBlockType.Dot))
                            return false;
                        return true;
                    }
                },

                Predicates.<TokenizationGraphEdge>alwaysTrue());
    }

    //see http://en.wikipedia.org/wiki/DOT_language
    protected void dumpTokenizationGraph(TokenizationGraph graph,
            Predicate<TokenizationGraphNode> sourceNodePredicate,
            Predicate<TokenizationGraphNode> targetNodePredicate, Predicate<TokenizationGraphEdge> edgePredicate) {
        final Map<TextBlockTypeGroup, TokenizationGraphNode> nodeMap = graph.getNodeMap();

        int instructedEdgeCount = 0;
        int inferredEdgeCount = 0;

        int addSpaceRuleCount = 0;
        int dontAddSpaceRuleCount = 0;

        System.out.println("digraph tokenizationGraph {");

        for (Map.Entry<TextBlockTypeGroup, TokenizationGraphNode> nodeEntry : nodeMap.entrySet()) {
            //all nodes are available on the map, so we don't need graph traversal
            final TokenizationGraphNode sourceNode = nodeEntry.getValue();

            if (!sourceNodePredicate.apply(sourceNode))
                continue;

            final String sourceNodeName = getNodeName(sourceNode);
            System.out.println("\t" + sourceNodeName);

            final Map<TextBlockTypeGroup, TokenizationGraphEdge> edges = sourceNode.getEdges();
            for (TokenizationGraphEdge tokenizationGraphEdge : edges.values()) {
                final TokenizationGraphNode targetNode = tokenizationGraphEdge.getTarget();
                if (!targetNodePredicate.apply(targetNode))
                    continue;

                if (!edgePredicate.apply(tokenizationGraphEdge))
                    continue;

                String style = tokenizationGraphEdge.isInferred() ? "dashed" : "solid";
                String color = tokenizationGraphEdge.isAddSpace() ? "blue" : "red";
                final String targetNodeName = getNodeName(targetNode);
                System.out.println("\t" + sourceNodeName + " -> " + targetNodeName + "[style=" + style + " color="
                        + color + "]");

                if (tokenizationGraphEdge.isInferred())
                    inferredEdgeCount++;
                else
                    instructedEdgeCount++;

                if (tokenizationGraphEdge.isAddSpace())
                    addSpaceRuleCount++;
                else
                    dontAddSpaceRuleCount++;
            }
        }

        System.out.println("// Number of nodes : " + nodeMap.size());
        System.out.println("// Number of edges: " + (inferredEdgeCount + instructedEdgeCount));
        System.out.println("// Instructed : " + instructedEdgeCount + ", inferred " + inferredEdgeCount);
        System.out.println("// Add space : " + addSpaceRuleCount + ", don't add space " + dontAddSpaceRuleCount);
        System.out.println();

        System.out.println("// Full graph node cound would be "
                + (TextBlockType.values().length * TextBlockType.values().length));
        System.out.println("// and in that case complete graph node count would be "
                + (TextBlockType.values().length * TextBlockType.values().length)
                        * (TextBlockType.values().length * TextBlockType.values().length));

        System.out.println("}");
    }

    protected String getNodeName(TokenizationGraphNode node) {
        final String str = Joiner.on('+')
                .join(Lists.transform(node.getData().getTextBlockTypes(), new Function<TextBlockType, String>() {
                    @Override
                    public String apply(TextBlockType input) {
                        return input.name();
                    }
                }));
        return "\"" + str + "\"";
    }

}