org.cloudgraph.examples.test.model.NLPWikiParseTest.java Source code


Introduction

Here is the source code for org.cloudgraph.examples.test.model.NLPWikiParseTest.java
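
The test below reads a local Wikipedia pages-articles XML dump with the edu.jhu.nlp.wikipedia SAX parser, filters out redirect and "File:" pages, splits each page's plain text into sentences, annotates qualifying sentences with a Stanford CoreNLP pipeline, and assembles the results into Plasma SDO data graphs (Page/Revision/Text on the wiki side, Document/Sentence on the parse side) that can be serialized to XML.

Independent of the CloudGraph SDO types, the CoreNLP usage the test relies on reduces to the following minimal sketch; the class name, annotator list, and sample text here are illustrative assumptions, not part of the test itself:

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class PipelineSketch {
    public static void main(String[] args) {
        // a light-weight pipeline; the test's setUp() builds a heavier one with ner, parse and dcoref
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // wrap the raw text and run every configured annotator over it
        Annotation document = new Annotation("CloudGraph stores SDO data graphs in HBase. It exposes them as typed objects.");
        pipeline.annotate(document);

        // each CoreMap is one sentence detected by the ssplit annotator
        for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
            System.out.println(sentence.toString());
        }
    }
}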

Source

/**
 *        CloudGraph Community Edition (CE) License
 * 
 * This is a community release of CloudGraph, a dual-license suite of
 * Service Data Object (SDO) 2.1 services designed for relational and 
 * big-table style "cloud" databases, such as HBase and others. 
 * This particular copy of the software is released under the 
 * version 2 of the GNU General Public License. CloudGraph was developed by 
 * TerraMeta Software, Inc.
 * 
 * Copyright (c) 2013, TerraMeta Software, Inc. All rights reserved.
 * 
 * General License information can be found below.
 * 
 * This distribution may include materials developed by third
 * parties. For license and attribution notices for these
 * materials, please refer to the documentation that accompanies
 * this distribution (see the "Licenses for Third-Party Components"
 * appendix) or view the online documentation at 
 * <http://cloudgraph.org/licenses/>. 
 */
package org.cloudgraph.examples.test.model;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.text.BreakIterator;
import java.util.List;
import java.util.Locale;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.cloudgraph.common.CommonTest;
import org.cloudgraph.examples.corpus.parse.Document;
import org.cloudgraph.examples.corpus.parse.Sentence;
import org.cloudgraph.examples.corpus.wiki.Page;
import org.cloudgraph.examples.corpus.wiki.Revision;
import org.cloudgraph.examples.wikicorpus.nlp.SentenceBreak;
import org.cloudgraph.examples.wikicorpus.nlp.SentenceUtil;
import org.cloudgraph.examples.wikicorpus.nlp.WikiAnnotator;
import org.plasma.sdo.helper.PlasmaDataFactory;
import org.plasma.sdo.helper.PlasmaTypeHelper;
import org.plasma.sdo.helper.PlasmaXMLHelper;
import org.plasma.sdo.xml.DefaultOptions;

import commonj.sdo.DataGraph;
import commonj.sdo.Type;
import commonj.sdo.helper.XMLDocument;

import edu.jhu.nlp.wikipedia.PageCallbackHandler;
import edu.jhu.nlp.wikipedia.WikiPage;
import edu.jhu.nlp.wikipedia.WikiXMLParser;
import edu.jhu.nlp.wikipedia.WikiXMLParserFactory;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

/**
 * Parses a local Wikipedia XML article dump, annotates page text with a
 * Stanford CoreNLP pipeline, and assembles the results into SDO data graphs.
 */
public class NLPWikiParseTest extends CommonTest {
    private static Log log = LogFactory.getLog(NLPWikiParseTest.class);
    private static final int SENTENCE_MAX = 300;
    private static final int SENTENCE_MIN = 12;
    private StanfordCoreNLP pipeline;
    private WikiAnnotator annotator;

    @Override
    public void setUp() throws Exception {
        super.setUp();
        if (annotator == null) {
            annotator = new WikiAnnotator();
            Properties props = new Properties();
            props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
            //props.put("annotators", "tokenize, ssplit, pos, lemma");
            pipeline = new StanfordCoreNLP(props);
        }
    }

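    /**
     * Parses a local enwiki pages-articles dump with the edu.jhu.nlp.wikipedia
     * SAX parser, dispatching each page to MyHandler.
     */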
    public void testParseLocal() throws Exception {
        WikiXMLParser wxsp = WikiXMLParserFactory
                .getSAXParser("../../data/enwiki-20130805-pages-articles1.xml-p000000010p000010000");
        wxsp.setPageCallback(new MyHandler());
        wxsp.parse();
    }

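    /**
     * Per-page callback for WikiXMLParser. Note that the lighter pipeline built in
     * the instance initializer (tokenize, ssplit, pos, lemma) shadows the outer
     * test pipeline; the parse(String, Document) helper uses the outer one.
     */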
    class MyHandler implements PageCallbackHandler {

        private StanfordCoreNLP pipeline;
        {
            Properties props = new Properties();
            //props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
            props.put("annotators", "tokenize, ssplit, pos, lemma");
            pipeline = new StanfordCoreNLP(props);

        }

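        /**
         * Skips redirect and media pages, writes the plain text under ./target,
         * builds a wiki Page/Revision data graph, then parses the text into a
         * separate parse Document graph and serializes it to XML.
         */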
        @Override
        public void process(WikiPage wikiPage) {

            String redirectPage = wikiPage.getRedirectPage();
            // skip redirect pages - they carry no article text
            if (redirectPage != null && redirectPage.length() > 0) {
                return;
            }
            // skip media description ("File:") pages
            if (wikiPage.getTitle().startsWith("File:")) {
                return;
            }
            byte[] bytes = wikiPage.getText().getBytes(Charset.forName("UTF-8"));
            String plainText = new String(bytes, Charset.forName("UTF-8"));
            if (plainText == null || plainText.trim().length() == 0) {
                log.info("ignoring: " + wikiPage.getTitle());
                return;
            }

            if (annotator.alphaCharCount(plainText) < 40) {
                log.info("ignoring: " + wikiPage.getTitle());
                return;
            }

            //log.info("TITLE: " + page.getTitle());
            //StringBuilder buf = new StringBuilder();
            //for (String cat : page.getCategories()) {
            //   buf.append("\t" + cat);
            //}
            //log.info("CATEGORIES: " + buf.toString());
            //if (page.getInfoBox() != null)
            //    log.info("INFOBOX: " + page.getInfoBox().dumpRaw());
            //buf = new StringBuilder();
            //for (String link : page.getLinks()) {
            //   buf.append("\t" + link);
            //}
            //log.info("LINKS: " + buf.toString());
            //log.info("TEXT: " + page.getText());
            writePlainTextFile(wikiPage, plainText);

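            // first data graph: the wiki Page / Revision / Text structure for this page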
            DataGraph dataGraph = PlasmaDataFactory.INSTANCE.createDataGraph();
            dataGraph.getChangeSummary().beginLogging();
            Type rootType = PlasmaTypeHelper.INSTANCE.getType(Page.class);
            log.info("creating: " + wikiPage.getTitle() + "(" + wikiPage.getID() + ")");
            Page page = (Page) dataGraph.createRootObject(rootType);
            page.setPageTitle(wikiPage.getTitle());
            page.setPageId(Integer.valueOf(wikiPage.getID()));
            Revision revision = page.createRevision();
            // FIXME: self-assignment; the revision id is never populated from the dump
            revision.setRevId(revision.getRevId());
            org.cloudgraph.examples.corpus.wiki.Text text = revision.createPlainText();
            text.setOldText(bytes);

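            // second data graph: the NLP parse Document that will hold the sentence chain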
            DataGraph docDataGraph = PlasmaDataFactory.INSTANCE.createDataGraph();
            docDataGraph.getChangeSummary().beginLogging();
            rootType = PlasmaTypeHelper.INSTANCE.getType(Document.class);
            Document parseDocument = (Document) docDataGraph.createRootObject(rootType);

            parse(plainText, parseDocument);

            String xml;
            try {
                xml = serialize(parseDocument);
                log.info(xml);
                //text.setParseXmlText(xml);
            } catch (Exception e) {
                log.error(e.getMessage(), e);
            }

            // walk the sentence chain to verify the previous/next links were wired
            for (Sentence sent : parseDocument.getSentence()) {
                Sentence previous = sent.getPrevious();
                Sentence next = sent.getNext();
            }

        }

    }

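    /**
     * Splits the plain text into sentence breaks and keeps a sliding
     * previous/current/next window so each created Sentence is linked to its
     * neighbors; sentences outside the length bounds, or failing the annotator's
     * alpha-and-punctuation check, are skipped before CoreNLP annotation.
     */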
    private void parse(String plainText, Document parseDocument) {
        Sentence previousSentence = null;
        Sentence currentSentence = null;
        Sentence nextSentence = null;
        SentenceBreak[] breaks = SentenceUtil.getBreaks(plainText);

        for (int i = 0; i < breaks.length; i++) {
            SentenceBreak brk = breaks[i];

            previousSentence = currentSentence;
            currentSentence = nextSentence;

            String nextSentenceText = plainText.substring(brk.firstIndex, brk.lastIndex);
            nextSentence = parseDocument.createSentence();
            //currentSentence.setText(currentSentenceText); // another mapper pass

            if (currentSentence != null) {
                if (previousSentence != null)
                    currentSentence.setPrevious(previousSentence);
                if (nextSentence != null)
                    currentSentence.setNext(nextSentence);
            }

            // decide if we're going to actually parse it
            if (nextSentenceText.length() > SENTENCE_MAX) {
                if (log.isDebugEnabled())
                    log.debug("sentence exceeded max length (" + SENTENCE_MAX + ") - ignoring");
                continue;
            }
            if (nextSentenceText.length() < SENTENCE_MIN) {
                if (log.isDebugEnabled())
                    log.debug("sentence under min length (" + SENTENCE_MIN + ") - ignoring");
                continue;
            }
            if (!this.annotator.onlyAlphaAndPunctuation(nextSentenceText)) {
                if (log.isDebugEnabled())
                    log.debug("sentence contains non punctuation chars - ignoring");
                continue;
            }

            // ok parse it
            long before = System.currentTimeMillis();
            Annotation document = new Annotation(nextSentenceText);
            this.pipeline.annotate(document);

            List<CoreMap> sentences = document.get(SentencesAnnotation.class);
            int parseSentCount = 0;
            for (CoreMap sentenceAnnotation : sentences) {
                parseSentCount++;
                if (parseSentCount > 1) {
                    log.warn("parsed multiple sentences from sentence break - ignoring");
                    continue;
                }
                this.annotator.buildSentence(this.pipeline, sentenceAnnotation, nextSentence, brk.firstIndex,
                        brk.lastIndex, parseDocument);
            }

            long after = System.currentTimeMillis();
            if (log.isDebugEnabled())
                log.debug("parse: " + String.valueOf(after - before) + ": " + nextSentenceText);
        }
    }

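    /**
     * Writes the page plain text to a UTF-8 file under ./target, using the page
     * title (with '/' replaced by '_') as the file name.
     */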
    private void writePlainTextFile(WikiPage page, String text) {
        OutputStreamWriter writer = null;
        try {
            FileOutputStream os = new FileOutputStream(new File("./target/", page.getTitle().trim().replace("/", "_") + ".txt"));
            writer = new OutputStreamWriter(os, "UTF-8");
            writer.write(text);
        } catch (IOException e) {
            log.error(e);
        } finally {
            if (writer != null) {
                try {
                    writer.flush();
                    writer.close();
                } catch (IOException e) {
                    log.error(e);
                }
            }
        }
    }

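    /**
     * Alternative sentence splitter based on java.text.BreakIterator; not used by
     * the wiki callback above, it only times and logs each detected sentence.
     */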
    private void parse(StringBuilder buf) throws IOException {
        BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);

        String text = buf.toString();
        int counter = 0;
        iterator.setText(text);

        int lastIndex = iterator.first();
        while (lastIndex != BreakIterator.DONE) {
            int firstIndex = lastIndex;
            lastIndex = iterator.next();

            if (lastIndex != BreakIterator.DONE) {
                String sentence = text.substring(firstIndex, lastIndex);
                long before = System.currentTimeMillis();
                //parse(sentence);
                long after = System.currentTimeMillis();
                log.info("time4: " + String.valueOf(after - before) + ": " + sentence);
                counter++;
            }
        }

    }

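    /**
     * Serializes the parse Document data graph to XML with PlasmaXMLHelper and
     * returns it as a UTF-8 string.
     */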
    private String serialize(Document parseDocument) throws IOException {
        DefaultOptions options = new DefaultOptions(parseDocument.getType().getURI());
        options.setRootNamespacePrefix("ns1");
        XMLDocument doc = PlasmaXMLHelper.INSTANCE.createDocument(parseDocument, parseDocument.getType().getURI(),
                null);
        options.setPrettyPrint(false);
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        PlasmaXMLHelper.INSTANCE.save(doc, os, options);
        os.flush();
        return new String(os.toByteArray(), "UTF-8");
    }

}