Java tutorial
/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.frankenstein;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Parses the book Frankenstein (located in the test resources folder), identifies sentences,
 * and then indexes them into Lucene.
 */
public class Frankenstein {
  protected RAMDirectory directory;
  protected IndexSearcher searcher;
  protected SentenceDetector sentenceDetector;
  protected Map<String, NameFinderME> finders;
  protected Tokenizer tokenizer;

  public static void main(String[] args) throws Exception {
    //<start id="frank.start"/>
    Frankenstein frankenstein = new Frankenstein();
    frankenstein.init();
    frankenstein.index();//<co id="frank.index"/>
    String query = null;
    while (true) {
      query = getQuery();//<co id="frank.query"/>
      if (query != null) {
        Results results = frankenstein.search(query);//<co id="frank.search"/>
        frankenstein.examineResults(results);//<co id="frank.exam"/>
        displayResults(results);
      } else {
        break;
      }
    }
    /*
    <calloutlist>
      <callout arearefs="frank.index"><para>Make the content searchable</para></callout>
      <callout arearefs="frank.query"><para>Prompt the user for a query</para></callout>
      <callout arearefs="frank.search"><para>Perform the search</para></callout>
      <callout arearefs="frank.exam"><para>Parse the results and show interesting items</para></callout>
    </calloutlist>
    */
    //<end id="frank.start"/>
  }

  private void examineResults(Results results) {
    for (Document match : results.matches) {
      //we have a paragraph; split it into sentences and then do NER
      String[] sentencesStr = sentenceDetector.sentDetect(match.get("paragraph"));
      if (sentencesStr != null && sentencesStr.length > 0) {
        Sentence[] sentences = new Sentence[sentencesStr.length];
        results.sentences.put(match.get("id"), sentences);
        //for each sentence, find named entities
        for (int i = 0; i < sentencesStr.length; i++) {
          sentences[i] = new Sentence(sentencesStr[i]);
          String[] tokens = tokenizer.tokenize(sentencesStr[i]);
          for (Map.Entry<String, NameFinderME> finder : finders.entrySet()) {
            String label = finder.getKey();
            Span[] names = finder.getValue().find(tokens);//spans index into the tokens array
            if (names != null && names.length > 0) {
              List<String> values = new ArrayList<String>();
              for (int j = 0; j < names.length; j++) {
                StringBuilder cb = new StringBuilder();
                for (int ti = names[j].getStart(); ti < names[j].getEnd(); ti++) {
                  cb.append(tokens[ti]).append(" ");
                }
                values.add(cb.toString());
              }
              sentences[i].names.put(label, values);
            }
          }
        }
      }
    }
  }

  /**
   * Search for the query string in the text.
   *
   * @param queryStr The query string
   * @return The Results
   * @throws IOException
   * @throws ParseException
   */
  private Results search(String queryStr) throws IOException, ParseException {
    System.out.println("Searching for: " + queryStr);
    if (searcher == null) {
      searcher = new IndexSearcher(directory, true);
    }
    Results result = new Results();
    QueryParser qp = new QueryParser(Version.LUCENE_36, "paragraph",
            new StandardAnalyzer(Version.LUCENE_36));
    Query query = qp.parse(queryStr);
    TopDocs topDocs = searcher.search(query, 20);
    System.out.println("Found " + topDocs.totalHits + " total hits.");
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
      Document theDoc = searcher.doc(topDocs.scoreDocs[i].doc);
      result.matches.add(theDoc);
    }
    return result;
  }
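  /*
   * For illustration: search() above hands the raw query string straight to
   * Lucene's classic QueryParser, so the standard query syntax applies. A few
   * hypothetical example inputs (not part of the original listing):
   *
   *   monster                  - single term
   *   "my dear Victor"         - exact phrase
   *   Elizabeth AND Geneva     - both terms must match
   *   paragraph:creature       - explicit field (the default field is "paragraph")
   */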
  /**
   * Index the content of Frankenstein
   *
   * @throws IOException
   */
  private void index() throws IOException {
    System.out.println("Indexing Frankenstein");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("frankenstein-gutenberg.txt");
    BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
    //let's index a paragraph at a time
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36,
            new StandardAnalyzer(Version.LUCENE_36));
    directory = new RAMDirectory();
    IndexWriter iw = new IndexWriter(directory, conf);
    String line;
    StringBuilder paraBuffer = new StringBuilder(2048);
    int lines = 0;
    int paragraphs = 0;
    int paragraphLines = 0;
    while ((line = reader.readLine()) != null) {
      if (line.contains("End of the Project Gutenberg")) {//we are in the license section at the end of the book
        break;
      }
      if (line.startsWith("#")) {//skip comments
        continue;
      }
      //if the line is blank, we have a paragraph, so let's index it
      if (line.matches("^\\s*$") && paraBuffer.length() > 0) {
        Document doc = new Document();//we can retrieve by paragraph number if we want
        String theString = paraBuffer.toString().trim();//String is immutable: keep the trimmed result
        if (theString.length() > 0) {
          addMetadata(doc, lines, paragraphs, paragraphLines);
          doc.add(new Field("paragraph", theString, Field.Store.YES,
                  Field.Index.ANALYZED));//add the main content
          iw.addDocument(doc);//index the document
          paragraphs++;
        }
        //reset some of our state: we are done with this paragraph
        paraBuffer.setLength(0);
        paragraphLines = 0;
      } else {
        paraBuffer.append(line).append(' ');
      }
      lines++;
      paragraphLines++;
    }
    System.out.println("Processed " + lines + " lines. Paragraphs: " + paragraphs);
    iw.close();
  }

  private void addMetadata(Document doc, int lines, int paragraphs, int paragraphLines) {
    doc.add(new Field("id", "frank_" + paragraphs, Field.Store.YES, Field.Index.NOT_ANALYZED));
    NumericField startLine = new NumericField("startLine", Field.Store.YES, true);
    startLine.setIntValue(lines - paragraphLines);
    doc.add(startLine);
    NumericField finishLine = new NumericField("finishLine", Field.Store.YES, true);
    finishLine.setIntValue(lines);
    doc.add(finishLine);
    NumericField paragraphNumber = new NumericField("paragraphNumber", Field.Store.YES, true);
    paragraphNumber.setIntValue(paragraphs);
    doc.add(paragraphNumber);
  }

  /**
   * Initialize the OpenNLP libraries and other resources.
   *
   * @throws IOException
   */
  private void init() throws IOException {
    System.out.println("Initializing Frankenstein");
    File models = new File("../../opennlp-models");
    File wordnet = new File("../../WordNet-3.0");
    if (models.exists() == false) {
      throw new FileNotFoundException("../../opennlp-models");
    }
    System.setProperty("model.dir", "../../opennlp-models");
    System.setProperty("wordnet.dir", "../../WordNet-3.0");
    File modelFile = new File(models, "en-sent.bin");
    InputStream modelStream = new FileInputStream(modelFile);
    SentenceModel model = new SentenceModel(modelStream);
    sentenceDetector = new SentenceDetectorME(model);
    finders = new HashMap<String, NameFinderME>();
    finders.put("Names", new NameFinderME(new TokenNameFinderModel(new FileInputStream(getPersonModel()))));
    finders.put("Dates", new NameFinderME(new TokenNameFinderModel(new FileInputStream(getDateModel()))));
    finders.put("Locations", new NameFinderME(new TokenNameFinderModel(new FileInputStream(getLocationModel()))));
    tokenizer = SimpleTokenizer.INSTANCE;
  }
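  /*
   * Note on the layout init() above assumes: relative to the working directory,
   * ../../opennlp-models must contain en-sent.bin, en-ner-person.bin,
   * en-ner-date.bin, and en-ner-location.bin, and ../../WordNet-3.0 must
   * contain the WordNet dict/ directory (see the getter methods below).
   */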
  private static String getQuery() throws IOException {
    System.out.println("");
    System.out.println("Type your query. Hit Enter to process the query (the empty string will exit the program):");
    System.out.print('>');
    System.out.flush();
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    String line = in.readLine();
    if (line == null || line.length() == 0) {
      return null;
    }
    return line;
  }

  private static void displayResults(Results results) {
    int k = 0;
    for (Document document : results.matches) {
      System.out.println("-----------------------------------");
      System.out.println("Match: [" + k + "] Paragraph: " + document.get("paragraphNumber"));
      System.out.println("Lines: " + document.get("startLine") + "-" + document.get("finishLine"));
      System.out.println("\t" + document.get("paragraph"));
      System.out.println("\t----- Sentences ----");
      Sentence[] sentences = results.sentences.get(document.get("id"));
      for (int i = 0; i < sentences.length; i++) {
        Sentence sentence = sentences[i];
        System.out.println("\t\t[" + i + "] " + sentence.sentence);
        if (sentence.names.isEmpty() == false) {
          for (Map.Entry<String, List<String>> entry : sentence.names.entrySet()) {
            System.out.println("\t\t>>>> " + entry.getKey());
            StringBuilder buff = new StringBuilder();
            if (entry.getValue().isEmpty() == false) {
              for (String val : entry.getValue()) {
                buff.append(val.trim()).append(", ");
              }
              buff.setLength(buff.length() - 2);//drop the last comma and space
              System.out.println("\t\t\t" + buff);
            }
          }
          System.out.println("");
        }
      }
      k++;
    }
  }

  public static File getWordNetDir() {
    String wordnetDir = System.getProperty("wordnet.dir");
    return new File(wordnetDir);
  }

  public static File getWordNetDictionary() {
    return new File(getWordNetDir(), "dict");
  }

  public static File getModelDir() {
    String modelsDirProp = System.getProperty("model.dir");
    return new File(modelsDirProp);
  }

  public static File getPersonModel() {
    return new File(getModelDir(), "en-ner-person.bin");
  }

  public static File getDateModel() {
    return new File(getModelDir(), "en-ner-date.bin");
  }

  public static File getLocationModel() {
    return new File(getModelDir(), "en-ner-location.bin");
  }
}

class Results {
  public List<Document> matches = new ArrayList<Document>();
  public Map<String, Sentence[]> sentences = new HashMap<String, Sentence[]>();
}

class Sentence {
  public String sentence;
  public Map<String, List<String>> names = new HashMap<String, List<String>>();

  public Sentence(String sent) {
    sentence = sent;
  }
}
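For readers who want to experiment with the OpenNLP pieces on their own, here is a minimal sketch of the same sentence-detection, tokenization, and name-finding flow used in examineResults(), pulled out of the class above. The NerSketch class name, the model paths, and the sample text are illustrative assumptions, and which names the finder actually reports depends on the trained model:

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;

import java.io.FileInputStream;

public class NerSketch {
  public static void main(String[] args) throws Exception {
    //Load the sentence detector and the person-name finder (paths are illustrative)
    SentenceDetectorME detector =
            new SentenceDetectorME(new SentenceModel(new FileInputStream("en-sent.bin")));
    NameFinderME personFinder =
            new NameFinderME(new TokenNameFinderModel(new FileInputStream("en-ner-person.bin")));

    String paragraph = "I am by birth a Genevese. Elizabeth Lavenza became my more than sister.";
    for (String sentence : detector.sentDetect(paragraph)) {
      String[] tokens = SimpleTokenizer.INSTANCE.tokenize(sentence);
      for (Span span : personFinder.find(tokens)) {
        //A Span indexes into the tokens array over [getStart(), getEnd())
        StringBuilder name = new StringBuilder();
        for (int i = span.getStart(); i < span.getEnd(); i++) {
          name.append(tokens[i]).append(' ');
        }
        System.out.println("Person: " + name.toString().trim());
      }
    }
    personFinder.clearAdaptiveData();//reset the finder's document-level state
  }
}

One design point worth noting: NameFinderME accumulates adaptive data across calls within a document to improve consistency, which is why clearing that state between documents, as above, is good practice.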