Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.any23.mime; import org.apache.any23.extractor.csv.CSVReaderBuilder; import org.apache.any23.mime.purifier.Purifier; import org.apache.any23.mime.purifier.WhiteSpacesPurifier; import org.apache.commons.io.input.CharSequenceReader; import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; import org.openrdf.rio.RDFFormat; import org.openrdf.rio.RDFParser; import org.openrdf.rio.Rio; import org.openrdf.rio.helpers.StatementCollector; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; import java.util.Collection; import java.util.regex.Pattern; /** * Implementation of {@link MIMETypeDetector} based on * <a href="http://lucene.apache.org/tika/">Apache Tika</a>. * * @author Michele Mostarda (michele.mostarda@gmail.com) * @author Davide Palmisano (dpalmisano@gmail.com) */ public class TikaMIMETypeDetector implements MIMETypeDetector { private Purifier purifier; // TODO: centralize mimetype strings somewhere. public static final String N3_MIMETYPE = "text/n3"; public static final String NQUADS_MIMETYPE = "text/nq"; public static final String TURTLE_MIMETYPE = "application/turtle"; public static final String CSV_MIMETYPE = "text/csv"; public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml"; /** * N3 patterns. */ private static final Pattern[] N3_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\."), // * URI URI . Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\."), // * URI BNODE . Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\."), // * URI LLITERAL . Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.") // * URI TLITERAL . }; /** * N-Quads patterns. */ private static final Pattern[] NQUADS_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\."), // * URI URI URI . Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\."), // * URI BNODE URI . Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\."), // * URI LLITERAL URI . Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.") // * URI TLITERAL URI . }; private static TikaConfig config = null; private static Tika tika; private static MimeTypes types; private static char[] n3InsideBlockChars = new char[] { '<' }; private static char[] n3LineCommentChars = new char[] { '#' }; private static char[] n3OutsideBlockChars = new char[] { '\n', '>' }; private static char[] n3SwitchBlockChars = new char[] { '"' }; private static char[] nquadsInsideBlockChars = new char[] { '<' }; private static char[] nquadsLineCommentChars = new char[] { '#' }; private static char[] nquadsOutsideBlockChars = new char[] { '\n' }; private static char[] nquadsSwitchBlockChars = new char[] { '"' }; private static char[] turtleInsideBlockChars = new char[] { '<' }; private static char[] turtleLineCommentChars = new char[] { '#' }; private static char[] turtleOutsideBlockChars = new char[] { '\n', '>' }; private static char[] turtleSwitchBlockChars = new char[] { '"' }; /** * Checks if the stream contains the <i>N3</i> triple patterns. * * @param is input stream to be verified. * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise. * @throws IOException */ public static boolean checkN3Format(InputStream is) throws IOException { return checkByRioFormat(RDFFormat.N3, is, n3InsideBlockChars, n3LineCommentChars, n3OutsideBlockChars, n3SwitchBlockChars); } /** * Checks if the stream contains the <i>NQuads</i> patterns. * * @param is input stream to be verified. * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise. * @throws IOException */ public static boolean checkNQuadsFormat(InputStream is) throws IOException { return checkByRioFormat(RDFFormat.NQUADS, is, nquadsInsideBlockChars, nquadsLineCommentChars, nquadsOutsideBlockChars, nquadsSwitchBlockChars); } /** * Checks if the stream contains <i>Turtle</i> triple patterns. * @param is input stream to be verified. * @param insideBlockChars TODO * @param outsideBlockChars TODO * @param switchBlockChars TODO * @return <code>true</code> if <i>Turtle</i> patterns are detected, <code>false</code> otherwise. * @throws IOException */ public static boolean checkByRioFormat(RDFFormat format, InputStream is, char[] insideBlockChars, char[] lineCommentChars, char[] outsideBlockChars, char[] switchBlockChars) throws IOException { StringBuilder sample = extractDataSample(is, '.', insideBlockChars, lineCommentChars, outsideBlockChars, switchBlockChars); RDFParser turtleParser = Rio.createParser(format); turtleParser.setDatatypeHandling(RDFParser.DatatypeHandling.VERIFY); turtleParser.setStopAtFirstError(true); turtleParser.setVerifyData(true); turtleParser.setRDFHandler(new StatementCollector()); Reader bais = new CharSequenceReader(sample); try { turtleParser.parse(bais, ""); return true; } catch (Exception e) { return false; } } /** * Checks if the stream contains a valid <i>CSV</i> pattern. * * @param is input stream to be verified. * @return <code>true</code> if <i>CSV</i> patterns are detected, <code>false</code> otherwise. * @throws IOException */ public static boolean checkCSVFormat(InputStream is) throws IOException { return CSVReaderBuilder.isCSV(is); } /** * Tries to apply one of the given patterns on a sample of the input stream. * * @param patterns the patterns to apply. * @param delimiterChar the delimiter of the sample. * @param is the input stream to sample. * @param insideBlockCharacters * @param outsideBlockCharacters * @param switchBlockCharacters * @return <code>true</code> if a pattern has been applied, <code>false</code> otherwise. * @throws IOException */ private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is, char[] insideBlockCharacters, char[] lineCommentCharacters, char[] outsideBlockCharacters, char[] switchBlockCharacters) throws IOException { StringBuilder sample = extractDataSample(is, delimiterChar, insideBlockCharacters, lineCommentCharacters, outsideBlockCharacters, switchBlockCharacters); for (Pattern pattern : patterns) { if (pattern.matcher(sample).find()) { return true; } } return false; } /** * Extracts a sample data from the input stream, from the current * mark to the first <i>breakChar</i> char. * * @param is the input stream to sample. * @param breakChar the char to break to sample. * @return the sample string. * @throws IOException if an error occurs during sampling. */ private static StringBuilder extractDataSample(InputStream is, char breakChar, char[] insideBlockCharacters, char[] lineCommentChars, char[] outsideBlockCharacters, char[] switchBlockCharacters) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(is)); StringBuilder sb = new StringBuilder(); // TODO: Make this configurable final int MAX_SIZE = 1024 * 2; int c; boolean insideBlock = false; int read = 0; br.mark(MAX_SIZE); try { while ((c = br.read()) != -1) { read++; if (sb.length() > MAX_SIZE) { break; } if (!insideBlock) { for (char nextLineCommentChar : lineCommentChars) { // if we hit a comment character that signals the rest of the line is a comment // then we do not want to extract any of the rest of the line, including the // comment character for our sample, so we read to the end of the line and then // continue the loop without appending anything if (c == nextLineCommentChar) { br.readLine(); continue; } } } for (char nextInsideChar : insideBlockCharacters) { if (c == nextInsideChar) insideBlock = true; } for (char nextOutsideChar : outsideBlockCharacters) { if (c == nextOutsideChar) insideBlock = false; } for (char nextSwitchChar : switchBlockCharacters) { if (c == nextSwitchChar) insideBlock = !insideBlock; } sb.append((char) c); if (!insideBlock && breakChar == c) { break; } } } finally { is.reset(); br.reset(); } return sb; } public TikaMIMETypeDetector(Purifier purifier) { this.purifier = purifier; InputStream is = getResourceAsStream(); if (config == null) { try { config = new TikaConfig(is); } catch (Exception e) { throw new RuntimeException("Error while loading Tika configuration.", e); } } if (types == null) { types = config.getMimeRepository(); } if (tika == null) { tika = new Tika(config); } } public TikaMIMETypeDetector() { this(new WhiteSpacesPurifier()); } /** * Estimates the <code>MIME</code> type of the content of input file. * The <i>input</i> stream must be resettable. * * @param fileName name of the data source. * @param input <code>null</code> or a <b>resettable</i> input stream containing data. * @param mimeTypeFromMetadata mimetype declared in metadata. * @return the supposed mime type or <code>null</code> if nothing appropriate found. * @throws IllegalArgumentException if <i>input</i> is not <code>null</code> and is not resettable. */ @Override public MIMEType guessMIMEType(String fileName, InputStream input, MIMEType mimeTypeFromMetadata) { if (input != null) { try { this.purifier.purify(input); } catch (IOException e) { throw new RuntimeException("Error while purifying the provided input", e); } } final Metadata meta = new Metadata(); if (mimeTypeFromMetadata != null) meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType()); if (fileName != null) meta.set(Metadata.RESOURCE_NAME_KEY, fileName); String type; try { final String mt = guessMimeTypeByInputAndMeta(input, meta); if (!MimeTypes.OCTET_STREAM.equals(mt)) { type = mt; } else { if (checkByRioFormat(RDFFormat.N3, input, n3InsideBlockChars, n3LineCommentChars, n3OutsideBlockChars, n3SwitchBlockChars)) { type = N3_MIMETYPE; } else if (checkByRioFormat(RDFFormat.NQUADS, input, nquadsInsideBlockChars, nquadsLineCommentChars, nquadsOutsideBlockChars, nquadsSwitchBlockChars)) { type = NQUADS_MIMETYPE; } else if (checkByRioFormat(RDFFormat.TURTLE, input, turtleInsideBlockChars, turtleLineCommentChars, turtleOutsideBlockChars, turtleSwitchBlockChars)) { type = TURTLE_MIMETYPE; } else if (checkCSVFormat(input)) { type = CSV_MIMETYPE; } else { type = MimeTypes.OCTET_STREAM; } } } catch (IOException ioe) { throw new RuntimeException("Error while retrieving mime type.", ioe); } return MIMEType.parse(type); } /** * Loads the <code>Tika</code> configuration file. * * @return the input stream containing the configuration. */ private InputStream getResourceAsStream() { InputStream result; result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME); if (result == null) { result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME); if (result == null) { result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME); } } return result; } /** * Automatically detects the MIME type of a document based on magic * markers in the stream prefix and any given metadata hints. * <p/> * The given stream is expected to support marks, so that this method * can reset the stream to the position it was in before this method * was called. * * @param stream document stream * @param metadata metadata hints * @return MIME type of the document * @throws IOException if the document stream could not be read */ private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata) throws IOException { if (stream != null) { final String type = tika.detect(stream); if (type != null && !isGenericMIMEType(type)) { return type; } } // Determines the MIMEType based on Content-Type hint if available. final String contentType = metadata.get(Metadata.CONTENT_TYPE); String candidateMIMEType = null; if (contentType != null) { try { MimeType type = types.forName(contentType); if (type != null) { if (!isPlainMIMEType(type.getName())) { return type.getName(); } else { candidateMIMEType = type.getName(); } } } catch (MimeTypeException mte) { // Malformed ocntent-type value, ignore. } } // Determines the MIMEType based on resource name hint if available. final String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY); if (resourceName != null) { String type = tika.detect(resourceName); if (!isGenericMIMEType(type)) { return type; } RDFFormat parserFormatForFileName = Rio.getParserFormatForFileName(resourceName); // if Rio recognised it, then return the default MIME Type for the given format that it recognised if (parserFormatForFileName != null) { return parserFormatForFileName.getDefaultMIMEType(); } } // Finally, use the default type if no matches found if (candidateMIMEType != null) { return candidateMIMEType; } else { return MimeTypes.OCTET_STREAM; } } private boolean isPlainMIMEType(String type) { return type.equals(MimeTypes.OCTET_STREAM) || type.equals(MimeTypes.PLAIN_TEXT); } private boolean isGenericMIMEType(String type) { return isPlainMIMEType(type) || type.equals(MimeTypes.XML); } }