Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.csv; import java.io.BufferedReader; import java.io.EOFException; import java.io.IOException; import java.io.PushbackReader; import java.io.Reader; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.io.input.ProxyReader; import org.apache.commons.lang3.mutable.MutableInt; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; class CSVSniffer { private static final int DEFAULT_MARK_LIMIT = 10000; private static final int PUSH_BACK = 2; static final int EOF = -1; static final int NEW_LINE = '\n'; static final int CARRIAGE_RETURN = '\r'; private static final int SPACE = ' '; private final char[] delimiters; private final int markLimit; CSVSniffer(char[] delimiters) { this(DEFAULT_MARK_LIMIT, delimiters); } CSVSniffer(int markLimit, char[] delimiters) { this.markLimit = markLimit; this.delimiters = delimiters; } List<CSVResult> sniff(Reader reader) throws IOException { if (!reader.markSupported()) { reader = new BufferedReader(reader); } List<CSVResult> ret = new ArrayList<>(); for (char delimiter : delimiters) { reader.mark(markLimit); try { CSVResult result = new Snifflet(delimiter).sniff(reader); ret.add(result); } finally { reader.reset(); } } Collections.sort(ret); return ret; } /** * @param reader * @param metadata * @return the best result given the detection results or {@link CSVResult#TEXT} * if the confidence is not above a threshold. * @throws IOException */ CSVResult getBest(Reader reader, Metadata metadata) throws IOException { //TODO: take into consideration the filename. Perhaps require //a higher confidence if detection contradicts filename? List<CSVResult> results = sniff(reader); if (results == null || results.size() == 0) { return CSVResult.TEXT; } CSVResult bestResult = results.get(0); if (bestResult.getConfidence() < 0.10) { return CSVResult.TEXT; } return bestResult; } //inner class that tests a single hypothesis/combination //of parameters for delimiter and quote character //this will throw an EOF before reading beyond the //markLimit number of characters (not bytes!) private class Snifflet { private final char delimiter; //hardcode this for now private final char quoteCharacter = '"'; Map<Integer, MutableInt> rowLengthCounts = new HashMap<>(); int charsRead = 0; int colCount = 0; int encapsulated = 0; //number of cells that are encapsulated in dquotes (for now) boolean parseException = false; public Snifflet(char delimiter) { this.delimiter = delimiter; } CSVResult sniff(Reader r) throws IOException { boolean eof = false; boolean hitMarkLimit = false; int lastC = -1; StringBuilder unquoted = new StringBuilder(); try (PushbackReader reader = new PushbackReader(new CloseShieldReader(r), PUSH_BACK)) { int c = read(reader); while (c != EOF) { if (c == quoteCharacter) { handleUnquoted(unquoted); //test to make sure there isn't an unencapsulated quote character in the middle of a cell if (lastC > -1 && lastC != delimiter && lastC != NEW_LINE && lastC != CARRIAGE_RETURN) { parseException = true; return calcResult(); } //TODO: test to make sure cell doesn't start with escaped ""the quick brown cat" boolean correctlyEncapsulated = consumeQuoted(reader, quoteCharacter); if (!correctlyEncapsulated) { parseException = true; return calcResult(); } } else if (c == delimiter) { handleUnquoted(unquoted); endColumn(); consumeSpaceCharacters(reader); } else if (c == NEW_LINE || c == CARRIAGE_RETURN) { if (unquoted.length() > 0) { endColumn(); } handleUnquoted(unquoted); endRow(); consumeNewLines(reader); } else { unquoted.append((char) c); } lastC = c; c = read(reader); } } catch (HitMarkLimitException e) { hitMarkLimit = true; } catch (UnsurprisingEOF e) { //totally ignore } catch (EOFException e) { //the consume* throw this to avoid //having to check -1 every time and //having to rely on potentially wonky //inputstreams not consistently returning -1 //after hitting EOF and returning the first -1. //Yes. That's a thing. eof = true; } finally { r.reset(); } //if you've hit the marklimit or an eof on a truncated file //don't add the last row's info if (!hitMarkLimit && !eof && lastC != NEW_LINE && lastC != CARRIAGE_RETURN) { handleUnquoted(unquoted); endColumn(); endRow(); } return calcResult(); } private CSVResult calcResult() { double confidence = getConfidence(); MediaType mediaType = TextAndCSVParser.CSV; if (delimiter == '\t') { mediaType = TextAndCSVParser.TSV; } return new CSVResult(confidence, mediaType, delimiter); } private void handleUnquoted(StringBuilder unquoted) { if (unquoted.length() > 0) { unquoted(unquoted.toString()); unquoted.setLength(0); } } void consumeSpaceCharacters(PushbackReader reader) throws IOException { int c = read(reader); while (c == SPACE) { c = read(reader); } if (c == EOF) { throw new UnsurprisingEOF(); } unread(reader, c); } /** * * @param reader * @param quoteCharacter * @return whether or not this was a correctly encapsulated cell * @throws UnsurprisingEOF if the file ended immediately after the close quote * @throws EOFException if the file ended in the middle of the encapsulated section * @throws IOException on other IOExceptions */ boolean consumeQuoted(PushbackReader reader, int quoteCharacter) throws IOException { //this currently assumes excel "escaping" of double quotes: //'the " quick' -> "the "" quick" //we can make this more interesting later with other //escaping options int c = read(reader); while (c != -1) { if (c == quoteCharacter) { int nextC = read(reader); if (nextC == EOF) { encapsulated++; endColumn(); throw new UnsurprisingEOF(); } else if (nextC != quoteCharacter) { encapsulated++; endColumn(); unread(reader, nextC); consumeSpaceCharacters(reader); //now make sure that the next character is eof, \r\n //or a delimiter nextC = read(reader); if (nextC == EOF) { throw new UnsurprisingEOF(); } else if (nextC == NEW_LINE || nextC == CARRIAGE_RETURN) { unread(reader, nextC); return true; } else if (nextC != delimiter) { unread(reader, nextC); return false; } unread(reader, nextC); return true; } } c = read(reader); } throw new EOFException(); } private int read(PushbackReader reader) throws IOException { if (charsRead >= markLimit - 1) { throw new HitMarkLimitException(); } int c = reader.read(); if (c == EOF) { return EOF; } charsRead++; return c; } private void unread(PushbackReader reader, int c) throws IOException { if (c != EOF) { reader.unread(c); charsRead--; } } //consume all consecutive '\r\n' in any order void consumeNewLines(PushbackReader reader) throws IOException { int c = read(reader); while (c == NEW_LINE || c == CARRIAGE_RETURN) { c = read(reader); } if (c == EOF) { throw new EOFException(); } unread(reader, c); return; } void endColumn() { colCount++; } void endRow() { MutableInt cnt = rowLengthCounts.get(colCount); if (cnt == null) { cnt = new MutableInt(1); rowLengthCounts.put(colCount, cnt); } else { cnt.increment(); } colCount = 0; } void unquoted(String string) { //TODO -- do some analysis to make sure you don't have //large tokens like 2,3,2,3,2,3, } double getConfidence() { double confidence = 0.0f; if (parseException) { return -1.0f; } //TODO -- add tests for long tokens containing //other delimiters, e.g. the,quick,brown,fox as a token //when testing '\t' double colCountConsistencyConf = calculateColumnCountConsistency(); if (colCountConsistencyConf > -1.0) { confidence = colCountConsistencyConf; } //the idea is that if there are a bunch of encapsulated //cells, then that should outweigh column length inconsistency //this particular formula offers a small initial increase //that eventually approaches 1.0 double encapsulatedBonus = 0; if (encapsulated > 0) { encapsulatedBonus = 1.0 - (1.0d / Math.pow(encapsulated, 0.2)); } return Math.min(confidence + encapsulatedBonus, 1.0); } private double calculateColumnCountConsistency() { int max = -1; int totalRows = 0; //find the most common row for (Map.Entry<Integer, MutableInt> e : rowLengthCounts.entrySet()) { int numCols = e.getKey(); int count = e.getValue().intValue(); //require that numCols > 1 so that you had at least //one delimiter in that row if (numCols > 1 && count > max) { max = count; } totalRows += count; } //if there's not enough info if (max < 0 || totalRows < 3) { return 0.0; } //TODO: convert this to continuous vs vague heuristic step function double consistency = (double) max / (double) totalRows; return ((1d - (1d / Math.pow(totalRows, 0.3))) * consistency); } } private static class UnsurprisingEOF extends EOFException { } private static class HitMarkLimitException extends EOFException { } private class CloseShieldReader extends ProxyReader { public CloseShieldReader(Reader r) { super(r); } @Override public void close() throws IOException { //do nothing } } }