// Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package opennlp.tools.parse_thicket.kernel_interface; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Logger; import org.apache.commons.io.FileUtils; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import opennlp.tools.jsmlearning.ProfileReaderWriter; import opennlp.tools.parse_thicket.ParseThicket; import opennlp.tools.parse_thicket.VerbNetProcessor; import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor; import opennlp.tools.parse_thicket.matching.Matcher; public class TreeKernelBasedClassifierMultiplePara extends TreeKernelBasedClassifier { boolean bShortRun = false; public void setShortRun() { bShortRun = true; } public void trainClassifier(String posDirectory, String negDirectory) { queuePos.clear(); queueNeg.clear(); addFilesPos(new File(posDirectory)); addFilesNeg(new File(negDirectory)); List<File> filesPos = new ArrayList<File>(queuePos), filesNeg = new ArrayList<File>(queueNeg); Collection<String> treeBankBuffer = new ArrayList<String>(); int countPos = 0, 
countNeg = 0; for (File f : filesPos) { // get first paragraph of text List<String> texts = DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(f); List<String> lines = formTreeKernelStructuresMultiplePara(texts, "1"); treeBankBuffer.addAll(lines); if (bShortRun && countPos > 3000) break; countPos++; } for (File f : filesNeg) { // get first paragraph of text List<String> texts = DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(f); List<String> lines = formTreeKernelStructuresMultiplePara(texts, "-1"); treeBankBuffer.addAll(lines); if (bShortRun && countNeg > 3000) break; countNeg++; } // write the lists of samples to a file try { FileUtils.writeLines(new File(path + trainingFileName), null, treeBankBuffer); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } // ProfileReaderWriter.writeReport(treeBankBuffer, // path+trainingFileName, ' '); // build the model tkRunner.runLearner(path, trainingFileName, modelFileName); } public List<String[]> classifyFilesInDirectory(String dirFilesToBeClassified) { Map<Integer, Integer> countObject = new HashMap<Integer, Integer>(); int itemCount = 0, objectCount = 0; List<String> treeBankBuffer = new ArrayList<String>(); queuePos.clear(); addFilesPos(new File(dirFilesToBeClassified)); List<File> filesUnkn = new ArrayList<File>(queuePos); for (File f : filesUnkn) { List<String> texts = DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(f); List<String> lines = formTreeKernelStructuresMultiplePara(texts, "0"); for (String l : lines) { countObject.put(itemCount, objectCount); itemCount++; } objectCount++; treeBankBuffer.addAll(lines); } // write the lists of samples to a file try { FileUtils.writeLines(new File(path + unknownToBeClassified), null, treeBankBuffer); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput); // read classification results 
List<String[]> classifResults = ProfileReaderWriter.readProfiles(path + classifierOutput, ' '); // iterate through classification results and set them as scores for // hits List<String[]> results = new ArrayList<String[]>(); itemCount = 0; objectCount = 0; int currentItemCount = 0; float accum = 0; for (String[] line : classifResults) { Float val = Float.parseFloat(line[0]); accum += val; // last line Boolean bLastLine = false; if (itemCount == classifResults.size() - 1) bLastLine = true; if (objectCount == countObject.get(itemCount) /* && !bLastLine */) { itemCount++; currentItemCount++; continue; } else while (objectCount != countObject.get(itemCount) - 1) { objectCount++; String[] rline = new String[] { filesUnkn.get(objectCount).getName(), "unknown", "0", filesUnkn.get(objectCount).getAbsolutePath(), new Integer(itemCount).toString(), new Integer(objectCount).toString() }; results.add(rline); } objectCount = countObject.get(itemCount); itemCount++; float averaged = accum / (float) currentItemCount; currentItemCount = 0; Boolean in = false; if (averaged > MIN_SVM_SCORE_TOBE_IN) in = true; String[] rline = new String[] { filesUnkn.get(objectCount).getName(), in.toString(), new Float(averaged).toString(), filesUnkn.get(objectCount).getAbsolutePath(), new Integer(itemCount).toString(), new Integer(objectCount).toString() }; results.add(rline); accum = 0; } return results; } protected List<String> formTreeKernelStructuresMultiplePara(List<String> texts, String flag) { List<String> extendedTreesDumpTotal = new ArrayList<String>(); try { for (String text : texts) { // get the parses from original documents, and form the training // dataset System.out.println("About to build pt from " + text); ParseThicket pt = matcher.buildParseThicketFromTextWithRST(text); System.out.print("About to build extended forest "); List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt); for (String line : extendedTreesDump) extendedTreesDumpTotal.add(flag + " |BT| " + 
line + " |ET| "); System.out.println("DONE"); } } catch (Exception e) { e.printStackTrace(); } return extendedTreesDumpTotal; } public static void main(String[] args) { VerbNetProcessor p = VerbNetProcessor .getInstance("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources"); TreeKernelBasedClassifierMultiplePara proc = new TreeKernelBasedClassifierMultiplePara(); proc.setKernelPath( "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/tree_kernel/"); proc.trainClassifier("/Users/bgalitsky/Documents/ENRON/detectors/design_docs/docs/design_doc_posNeg/pos", "/Users/bgalitsky/Documents/ENRON/detectors/design_docs/docs/design_doc_posNeg/neg"); // "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/ted", // "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/Tedi"); // List<String[]>res = proc.classifyFilesInDirectory(args[2]); // ProfileReaderWriter.writeReport(res, "svmDesignDocReport05plus.csv"); } } /* * Number of examples: 12767, linear space size: 10 * * estimating ... Setting default regularization parameter C=1.0000 * Optimizing................................................................... * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. 
* ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. 
* ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. 
* ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................................. * ............................................................. Checking * optimality of inactive variables...done. Number of inactive variables = 3632 * done. (5288 iterations) Optimization finished (628 misclassified, * maxdiff=0.00099). Runtime in cpu-seconds: 879.17 Number of SV: 7723 * (including 2703 at upper bound) L1 loss: loss=1731.86774 Norm of weight * vector: |w|=69.71561 Norm of longest example vector: |x|=1.00000 Estimated * VCdim of classifier: VCdim<=4861.26666 Computing XiAlpha-estimates...done * Runtime for XiAlpha-estimates in cpu-seconds: 0.13 XiAlpha-estimate of the * error: error<=30.35% (rho=1.00,depth=0) XiAlpha-estimate of the recall: * recall=>76.31% (rho=1.00,depth=0) XiAlpha-estimate of the precision: * precision=>66.10% (rho=1.00,depth=0) */