// Java tutorial
/* Copyright 2014 Norconex Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.norconex.importer.handler.tagger.impl; import java.io.IOException; import java.io.Reader; import java.math.BigDecimal; import java.text.BreakIterator; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.stream.XMLStreamException; import org.apache.commons.configuration.XMLConfiguration; import org.apache.commons.io.IOUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.lang3.StringUtils; import com.norconex.commons.lang.config.IXMLConfigurable; import com.norconex.commons.lang.xml.EnhancedXMLStreamWriter; import com.norconex.importer.doc.ImporterMetadata; import com.norconex.importer.handler.ImporterHandlerException; import com.norconex.importer.handler.tagger.AbstractCharStreamTagger; /** * <p>Analyzes the content of the supplied document and adds statistical * information about its content or field as metadata fields. Default * behavior provide the statistics about the content. 
Refer to the following * for the new metadata fields to be created along with their description.</p> * * <table border="1"> * <caption>Statistic fields</caption> * <tr> * <th>Field name</th> * <th>Description</th> * </tr> * <tr> * <td>document.stat.characterCount</td> * <td>Total number of characters (excluding carriage returns/line * feed).</td> * </tr> * <tr> * <td>document.stat.wordCount</td> * <td>Total number of words.</td> * </tr> * <tr> * <td>document.stat.sentenceCount</td> * <td>Total number of sentences.</td> * </tr> * <tr> * <td>document.stat.paragraphCount</td> * <td>Total number of paragraph.</td> * </tr> * <tr> * <td>document.stat.averageWordCharacterCount</td> * <td>Average number of character in every words.</td> * </tr> * <tr> * <td>document.stat.averageSentenceCharacterCount</td> * <td>Average number of character in sentences (including non-word * characters, such as spaces, or slashes).</td> * </tr> * <tr> * <td>document.stat.averageSentenceWordCount</td> * <td>Average number of words per sentences.</td> * </tr> * <tr> * <td>document.stat.averageParagraphCharacterCount</td> * <td>Average number of characters in paragraphs (including non-word * characters, such as spaces, or slashes).</td> * </tr> * <tr> * <td>document.stat.averageParagraphSentenceCount</td> * <td>Average number of sentences per paragraphs.</td> * </tr> * <tr> * <td>document.stat.averageParagraphWordCount</td> * <td>Average number of words per paragraphs.</td> * </tr> * </table> * * <p>You can specify a field name to obtain statistics about that field instead. * When you do so, the field name will be inserted in the above * names, right after "document.stat.". 
E.g.: * <code>document.stat.myfield.characterCount</code></p> * * <p>Can be used both as a pre-parse (text-only) or post-parse handler.</p> * * <p>XML configuration usage:</p> * <pre> * <tagger class="com.norconex.importer.handler.tagger.impl.TextStatisticsTagger" * fieldName="(optional field name instead of using content)" > * * <restrictTo caseSensitive="[false|true]" * field="(name of header/metadata field name to match)"> * (regular expression of value to match) * </restrictTo> * <!-- multiple "restrictTo" tags allowed (only one needs to match) --> * </tagger> * </pre> * @author Pascal Essiembre * @since 2.0.0 */ @SuppressWarnings("nls") public class TextStatisticsTagger extends AbstractCharStreamTagger implements IXMLConfigurable { private static final Pattern PATTERN_WORD = Pattern.compile("\\w+\\-{0,1}\\w*", Pattern.UNICODE_CHARACTER_CLASS); private String fieldName; @Override protected void tagTextDocument(String reference, Reader input, ImporterMetadata metadata, boolean parsed) throws ImporterHandlerException { long charCount = 0; long wordCharCount = 0; long wordCount = 0; long sentenceCount = 0; long sentenceCharCount = 0; long paragraphCount = 0; //TODO make this more efficient, by doing all this in one pass. 
LineIterator it = IOUtils.lineIterator(input); while (it.hasNext()) { String line = it.nextLine().trim(); if (StringUtils.isBlank(line)) { continue; } // Paragraph paragraphCount++; // Character charCount += line.length(); // Word Matcher matcher = PATTERN_WORD.matcher(line); while (matcher.find()) { int wordLength = matcher.end() - matcher.start(); wordCount++; wordCharCount += wordLength; } // Sentence BreakIterator boundary = BreakIterator.getSentenceInstance(); boundary.setText(line); int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { sentenceCharCount += (end - start); sentenceCount++; } } String field = StringUtils.EMPTY; if (StringUtils.isNotBlank(fieldName)) { field = fieldName.trim() + "."; } //--- Add fields --- metadata.addLong("document.stat." + field + "characterCount", charCount); metadata.addLong("document.stat." + field + "wordCount", wordCount); metadata.addLong("document.stat." + field + "sentenceCount", sentenceCount); metadata.addLong("document.stat." + field + "paragraphCount", paragraphCount); metadata.addString("document.stat." + field + "averageWordCharacterCount", divide(wordCharCount, wordCount)); metadata.addString("document.stat." + field + "averageSentenceCharacterCount", divide(sentenceCharCount, sentenceCount)); metadata.addString("document.stat." + field + "averageSentenceWordCount", divide(wordCount, sentenceCount)); metadata.addString("document.stat." + field + "averageParagraphCharacterCount", divide(charCount, paragraphCount)); metadata.addString("document.stat." + field + "averageParagraphSentenceCount", divide(sentenceCount, paragraphCount)); metadata.addString("document.stat." 
+ field + "averageParagraphWordCount", divide(wordCount, paragraphCount)); } private String divide(long value, long divisor) { return BigDecimal.valueOf(value).divide(BigDecimal.valueOf(divisor), 1, BigDecimal.ROUND_HALF_UP) .toString(); } public String getFieldName() { return fieldName; } public void setFieldName(String fieldName) { this.fieldName = fieldName; } @Override protected void loadHandlerFromXML(XMLConfiguration xml) throws IOException { setFieldName(xml.getString("[@fieldName]", getFieldName())); } @Override protected void saveHandlerToXML(EnhancedXMLStreamWriter writer) throws XMLStreamException { if (StringUtils.isNotBlank(fieldName)) { writer.writeAttribute("fieldName", fieldName); } } }