Example usage for org.apache.commons.io.input BoundedInputStream BoundedInputStream

List of usage examples for org.apache.commons.io.input BoundedInputStream BoundedInputStream

Introduction

In this page you can find the example usage for org.apache.commons.io.input BoundedInputStream BoundedInputStream.

Prototype

public BoundedInputStream(InputStream in, long size) 

Source Link

Document

Creates a new BoundedInputStream that wraps the given input stream and limits it to a certain size.

Usage

From source file:org.xdi.oxauth.cert.validation.CRLCertificateVerifier.java

public X509CRL requestCRL(String url)
        throws IOException, MalformedURLException, CertificateException, CRLException {
    HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
    try {//  w  w  w . j av a 2 s. c  o  m
        con.setUseCaches(false);

        InputStream in = new BoundedInputStream(con.getInputStream(), maxCrlSize);
        try {
            CertificateFactory certificateFactory = CertificateFactory.getInstance("X.509");
            X509CRL crl = (X509CRL) certificateFactory.generateCRL(in);
            log.debug("CRL size: " + crl.getEncoded().length + " bytes");

            return crl;
        } finally {
            IOUtils.closeQuietly(in);
        }
    } catch (IOException ex) {
        log.error("Failed to download CRL from '" + url + "'", ex);
    } finally {
        if (con != null) {
            con.disconnect();
        }
    }

    return null;
}

From source file:uk.bl.wa.analyser.payload.TikaPayloadAnalyser.java

/**
 * @param source the source of the input stream - typically a WARC file. Used for error logging.
 * @param solr  /*from ww w  .j av a2  s.  c om*/
 * @param is  content to analyse.
 * @param url optional URL for the bytes in is.
 * @return
 * @throws IOException
 */
@SuppressWarnings("deprecation")
public SolrRecord extract(String source, SolrRecord solr, InputStream is, String url) throws IOException {

    // Set up the TikaInputStream:
    TikaInputStream tikainput = null;
    if (this.maxBytesToParser > 0) {
        tikainput = TikaInputStream
                .get(new BoundedInputStream(new CloseShieldInputStream(is), maxBytesToParser));
    } else {
        tikainput = TikaInputStream.get(new CloseShieldInputStream(is));
    }

    // Also pass URL as metadata to allow extension hints to work:
    Metadata metadata = new Metadata();
    if (url != null)
        metadata.set(Metadata.RESOURCE_NAME_KEY, url);

    final long detectStart = System.nanoTime();
    StringBuilder detected = new StringBuilder();
    try {
        DetectRunner detect = new DetectRunner(source, tika, tikainput, detected, metadata);
        TimeLimiter.run(detect, 10000L, false);
    } catch (NoSuchFieldError e) {
        // TODO Is this an Apache POI version issue?
        log.error("Tika.detect(): " + e.getMessage() + " for " + url + " in " + source);
        addExceptionMetadata(metadata, new Exception("detect threw " + e.getClass().getCanonicalName()));
    } catch (Exception e) {
        log.error("Tika.detect(): " + e.getMessage() + " for " + url + " in " + source);
        addExceptionMetadata(metadata, e);
    }
    Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#detect",
            detectStart);

    // Only proceed if we have a suitable type:
    if (!this.checkMime(detected.toString())) {
        if ("".equals(detected.toString())) {
            solr.addField(SolrFields.SOLR_CONTENT_TYPE, MediaType.APPLICATION_OCTET_STREAM.toString());
        } else {
            solr.addField(SolrFields.SOLR_CONTENT_TYPE, detected.toString());
        }
        return solr;
    }

    // Context
    ParseContext context = new ParseContext();
    StringWriter content = new StringWriter();

    // Override the recursive parsing:
    if (embedded == null)
        embedded = new NonRecursiveEmbeddedDocumentExtractor(context);
    context.set(EmbeddedDocumentExtractor.class, embedded);

    try {
        final long parseStart = System.nanoTime();
        ParseRunner runner = new ParseRunner(source, tika.getParser(), tikainput, this.getHandler(content),
                metadata, context);
        try {
            TimeLimiter.run(runner, parseTimeout, true);
        } catch (OutOfMemoryError o) {
            log.error("TikaExtractor.parse() - OutOfMemoryError: " + o.getMessage() + " for " + url + " in "
                    + source);
            addExceptionMetadata(metadata, new Exception("OutOfMemoryError"));
        } catch (RuntimeException r) {
            log.error("TikaExtractor.parse() - RuntimeException: " + r.getMessage() + " for " + url + " in "
                    + source);
            addExceptionMetadata(metadata, r);
        }
        Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#parse",
                parseStart);

        // If there was a parse error, report it:
        String tikaException = metadata.get(TikaPayloadAnalyser.TIKA_PARSE_EXCEPTION);
        if (tikaException != null) {
            solr.addParseException(tikaException, new RuntimeException("Exception from Tika"));
        }

        final long extractStart = System.nanoTime();
        // Copy the body text, forcing a UTF-8 encoding:
        String output = new String(content.toString().getBytes("UTF-8"));
        if (runner.complete || !output.equals("")) {
            if (output.length() > this.max_text_length) {
                output = output.substring(0, this.max_text_length);
            }
            log.debug("Extracted text from: " + url + " in " + source);
            log.debug("Extracted text: " + StringUtils.left(output, 300));
            solr.setField(SolrFields.SOLR_EXTRACTED_TEXT, output);
            solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_LENGTH, Integer.toString(output.length()));
        } else {
            //log.debug("Failed to extract any text from: "+url);
        }

        // Noisily report all metadata properties:
        /*
         * for( String m : metadata.names() ) {
         * log.info("For "+url.substring(url.length() - (int)
         * Math.pow(url.length(),0.85))+": "+m+" -> "+metadata.get(m)); }
         */

        // Attempt to record all metadata discovered:
        if (this.extractAllMetadata) {
            for (String m : metadata.names()) {
                // Ignore these as they are not very interesting:
                if (Metadata.RESOURCE_NAME_KEY.equalsIgnoreCase(m) || "dc:title".equalsIgnoreCase(m)
                        || "title".equalsIgnoreCase(m) || "description".equalsIgnoreCase(m)
                        || "keywords".equalsIgnoreCase(m) || Metadata.CONTENT_ENCODING.equalsIgnoreCase(m)
                        || Metadata.CONTENT_LOCATION.equalsIgnoreCase(m) || "ACTINICTITLE".equalsIgnoreCase(m)
                        || Metadata.CONTENT_TYPE.equalsIgnoreCase(m)) {
                    continue;
                }
                // Record in the document, but trim big ones:
                String value = metadata.get(m);
                if (value != null && value.length() > 100) {
                    value = value.substring(0, 100);
                }
                solr.addField(SolrFields.SOLR_TIKA_METADATA, m + "=" + value);
            }
        }

        // Also Pick out particular metadata:
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        solr.addField(SolrFields.SOLR_CONTENT_TYPE, contentType);
        solr.addField(SolrFields.SOLR_TITLE, metadata.get(DublinCore.TITLE));
        solr.addField(SolrFields.SOLR_DESCRIPTION, metadata.get(DublinCore.DESCRIPTION));
        solr.addField(SolrFields.SOLR_KEYWORDS, metadata.get("keywords"));
        solr.addField(SolrFields.SOLR_AUTHOR, metadata.get(DublinCore.CREATOR));
        solr.addField(SolrFields.CONTENT_ENCODING, metadata.get(Metadata.CONTENT_ENCODING));

        // Parse out any embedded date that can act as a created/modified date.
        // I was not able find a single example where both created and modified where defined and different. I single field is sufficient.
        String date = null;
        if (metadata.get(Metadata.CREATION_DATE) != null)
            date = metadata.get(Metadata.CREATION_DATE);

        if (metadata.get(Metadata.DATE) != null)
            date = metadata.get(Metadata.DATE);
        if (metadata.get(Metadata.MODIFIED) != null)
            date = metadata.get(Metadata.MODIFIED);
        if (date != null) {
            DateTimeFormatter df = ISODateTimeFormat.dateTimeParser();
            DateTime edate = null;
            try {
                edate = df.parseDateTime(date);
            } catch (IllegalArgumentException e) {
                log.error("Could not parse date: " + date + " from URL " + url + " in " + source);
            }
            if (edate == null) {
                Date javadate = Times.extractDate(date);
                if (javadate != null)
                    edate = new org.joda.time.DateTime(javadate);
            }
            if (edate != null) {
                solr.addField(SolrFields.LAST_MODIFIED_YEAR, "" + edate.getYear());
                DateTimeFormatter iso_df = ISODateTimeFormat.dateTimeNoMillis().withZone(DateTimeZone.UTC);
                // solr.getSolrDocument().setField(SolrFields.LAST_MODIFIED,
                // edate);
                solr.setField(SolrFields.LAST_MODIFIED, iso_df.print(edate));
            }
        }

        // Also look to record the software identifiers:

        // Look for generic xmp:CreatorTool
        solr.addField(SolrFields.GENERATOR, metadata.get("xmp:CreatorTool"));
        // For PDF, support other metadata tags:
        //solr.addField(SolrFields.GENERATOR, metadata.get( "creator" )); // This appears to be dc:creator i.e. author.
        solr.addField(SolrFields.GENERATOR, metadata.get("producer"));
        solr.addField(SolrFields.GENERATOR, metadata.get(Metadata.SOFTWARE));
        solr.addField(SolrFields.GENERATOR, metadata.get("software"));
        solr.addField(SolrFields.GENERATOR, metadata.get("Software"));
        solr.addField(SolrFields.GENERATOR, metadata.get("generator"));
        solr.addField(SolrFields.GENERATOR, metadata.get("Generator"));
        solr.addField(SolrFields.GENERATOR, metadata.get("ProgId"));

        //handle image EXIF metaformat
        String exifVersion = metadata.get("Exif Version");

        if (exifVersion != null) {
            solr.addField(SolrFields.EXIF_VERSION, exifVersion);
            String exif_artist = metadata.get("Artist");
            if (exif_artist != null) { // This is a better value for the author field
                // This potentially results in multiple author, which is valid
                solr.addField(SolrFields.SOLR_AUTHOR, exif_artist);
            }

            if (this.extractExifLocation) {

                String exif_latitude = metadata.get("GPS Latitude");
                String exif_longitude = metadata.get("GPS Longitude");

                if (exif_latitude != null && exif_longitude != null) {
                    double latitude = DMS2DG(exif_latitude);
                    double longitude = DMS2DG(exif_longitude);

                    try {
                        if (latitude != 0d && longitude != 0d) { // Sometimes they are defined but both 0
                            if (latitude <= 90 && latitude >= -90 && longitude <= 180 && longitude >= -180) {
                                solr.addField(SolrFields.EXIF_LOCATION, latitude + "," + longitude);
                            } else {
                                log.warn(
                                        "invalid gsp exif information:" + exif_latitude + "," + exif_longitude);
                            }

                        }
                    } catch (Exception e) { //Just ignore. No GPS data added to solr
                        log.warn("error parsing exif gps data. latitude:" + exif_latitude + " longitude:"
                                + exif_longitude);
                    }
                }
            }
        }
        //End image exif metadata

        // Application ID, MS Office only AFAICT, and the VERSION is only doc
        String software = null;
        if (metadata.get(Metadata.APPLICATION_NAME) != null)
            software = metadata.get(Metadata.APPLICATION_NAME);
        if (metadata.get(Metadata.APPLICATION_VERSION) != null)
            software += " " + metadata.get(Metadata.APPLICATION_VERSION);
        // Images, e.g. JPEG and TIFF, can have 'Software', 'tiff:Software',
        // PNGs have a 'tEXt tEXtEntry: keyword=Software, value=GPL Ghostscript 8.71'
        String png_textentry = metadata.get("tEXt tEXtEntry");
        if (png_textentry != null && png_textentry.contains("keyword=Software, value="))
            software = png_textentry.replace("keyword=Software, value=", "");
        /* Some JPEGs have this:
        Jpeg Comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality
        comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality
         */
        if (software != null) {
            solr.addField(SolrFields.GENERATOR, software);
        }
        Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#extract",
                extractStart);

    } catch (Exception e) {
        log.error("TikaExtractor.extract(): " + e.getMessage() + " for URL " + url + " in " + source);
    }

    // TODO: This should probably be wrapped in a method-spanning try-finally to guarantee close
    if (tikainput != null) {
        try {
            tikainput.close();
        } catch (IOException e) {
            log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage() + " for "
                    + url + " in " + source);
        }
    }

    return solr;
}

From source file:uk.bl.wa.solr.TikaExtractor.java

/**
 * /*from www  .  j a v a2 s .  com*/
 * @param solr
 * @param is
 * @param url
 * @return
 * @throws IOException
 */
@SuppressWarnings("deprecation")
public SolrRecord extract(SolrRecord solr, InputStream is, String url) throws IOException {

    // Set up the TikaInputStream:
    TikaInputStream tikainput = null;
    if (this.maxBytesToParser > 0) {
        tikainput = TikaInputStream
                .get(new BoundedInputStream(new CloseShieldInputStream(is), maxBytesToParser));
    } else {
        tikainput = TikaInputStream.get(new CloseShieldInputStream(is));
    }

    // Also pass URL as metadata to allow extension hints to work:
    Metadata metadata = new Metadata();
    if (url != null)
        metadata.set(Metadata.RESOURCE_NAME_KEY, url);

    final long detectStart = System.nanoTime();
    StringBuilder detected = new StringBuilder();
    try {
        DetectRunner detect = new DetectRunner(tika, tikainput, detected, metadata);
        Thread detectThread = new Thread(detect, Long.toString(System.currentTimeMillis()));
        detectThread.start();
        detectThread.join(10000L);
        detectThread.interrupt();
    } catch (NoSuchFieldError e) {
        // TODO Is this an Apache POI version issue?
        log.error("Tika.detect(): " + e.getMessage());
        addExceptionMetadata(metadata, new Exception("detect threw " + e.getClass().getCanonicalName()));
    } catch (Exception e) {
        log.error("Tika.detect(): " + e.getMessage());
        addExceptionMetadata(metadata, e);
    }
    Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#detect",
            detectStart);

    // Only proceed if we have a suitable type:
    if (!this.checkMime(detected.toString())) {
        if ("".equals(detected.toString())) {
            solr.addField(SolrFields.SOLR_CONTENT_TYPE, MediaType.APPLICATION_OCTET_STREAM.toString());
        } else {
            solr.addField(SolrFields.SOLR_CONTENT_TYPE, detected.toString());
        }
        return solr;
    }

    // Context
    ParseContext context = new ParseContext();
    StringWriter content = new StringWriter();

    // Override the recursive parsing:
    if (embedded == null)
        embedded = new NonRecursiveEmbeddedDocumentExtractor(context);
    context.set(EmbeddedDocumentExtractor.class, embedded);

    try {
        final long parseStart = System.nanoTime();
        ParseRunner runner = new ParseRunner(tika.getParser(), tikainput, this.getHandler(content), metadata,
                context);
        Thread parseThread = new Thread(runner, Long.toString(System.currentTimeMillis()));
        try {
            parseThread.start();
            parseThread.join(this.parseTimeout);
            parseThread.interrupt();
            parseThread.join(this.parseTimeout);
        } catch (OutOfMemoryError o) {
            log.error("TikaExtractor.parse() - OutOfMemoryError: " + o.getMessage());
            addExceptionMetadata(metadata, new Exception("OutOfMemoryError"));
        } catch (RuntimeException r) {
            log.error("TikaExtractor.parse() - RuntimeException: " + r.getMessage());
            addExceptionMetadata(metadata, r);
        }
        Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#parse",
                parseStart);

        // If there was a parse error, report it:
        solr.addField(SolrFields.PARSE_ERROR, metadata.get(TikaExtractor.TIKA_PARSE_EXCEPTION));

        final long extractStart = System.nanoTime();
        // Copy the body text, forcing a UTF-8 encoding:
        String output = new String(content.toString().getBytes("UTF-8"));
        if (runner.complete || !output.equals("")) {
            if (output.length() > this.max_text_length) {
                output = output.substring(0, this.max_text_length);
            }
            log.debug("Extracted text from: " + url);
            log.debug("Extracted text: " + StringUtils.left(output, 300));
            solr.setField(SolrFields.SOLR_EXTRACTED_TEXT, output);
            solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_LENGTH, Integer.toString(output.length()));
        } else {
            //log.debug("Failed to extract any text from: "+url);
        }

        // Noisily report all metadata properties:
        /*
         * for( String m : metadata.names() ) {
         * log.info("For "+url.substring(url.length() - (int)
         * Math.pow(url.length(),0.85))+": "+m+" -> "+metadata.get(m)); }
         */

        // Attempt to record all metadata discovered:
        if (this.extractAllMetadata) {
            for (String m : metadata.names()) {
                // Ignore these as they are not very interesting:
                if (Metadata.RESOURCE_NAME_KEY.equalsIgnoreCase(m) || "dc:title".equalsIgnoreCase(m)
                        || "title".equalsIgnoreCase(m) || "description".equalsIgnoreCase(m)
                        || "keywords".equalsIgnoreCase(m) || Metadata.CONTENT_ENCODING.equalsIgnoreCase(m)
                        || Metadata.CONTENT_LOCATION.equalsIgnoreCase(m) || "ACTINICTITLE".equalsIgnoreCase(m)
                        || Metadata.CONTENT_TYPE.equalsIgnoreCase(m)) {
                    continue;
                }
                // Record in the document, but trim big ones:
                String value = metadata.get(m);
                if (value != null && value.length() > 100) {
                    value = value.substring(0, 100);
                }
                solr.addField(SolrFields.SOLR_TIKA_METADATA, m + "=" + value);
            }
        }

        // Also Pick out particular metadata:
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        solr.addField(SolrFields.SOLR_CONTENT_TYPE, contentType);
        solr.addField(SolrFields.SOLR_TITLE, metadata.get(DublinCore.TITLE));
        solr.addField(SolrFields.SOLR_DESCRIPTION, metadata.get(DublinCore.DESCRIPTION));
        solr.addField(SolrFields.SOLR_KEYWORDS, metadata.get("keywords"));
        solr.addField(SolrFields.SOLR_AUTHOR, metadata.get(DublinCore.CREATOR));
        solr.addField(SolrFields.CONTENT_ENCODING, metadata.get(Metadata.CONTENT_ENCODING));

        // Parse out any embedded date that can act as a created/modified date.
        String date = null;
        if (metadata.get(Metadata.CREATION_DATE) != null)
            date = metadata.get(Metadata.CREATION_DATE);
        if (metadata.get(Metadata.DATE) != null)
            date = metadata.get(Metadata.DATE);
        if (metadata.get(Metadata.MODIFIED) != null)
            date = metadata.get(Metadata.MODIFIED);
        if (date != null) {
            DateTimeFormatter df = ISODateTimeFormat.dateTimeParser();
            DateTime edate = null;
            try {
                edate = df.parseDateTime(date);
            } catch (IllegalArgumentException e) {
                log.error("Could not parse: " + date);
            }
            if (edate == null) {
                Date javadate = Times.extractDate(date);
                if (javadate != null)
                    edate = new org.joda.time.DateTime(javadate);
            }
            if (edate != null) {
                solr.addField(SolrFields.LAST_MODIFIED_YEAR, "" + edate.getYear());
                DateTimeFormatter iso_df = ISODateTimeFormat.dateTimeNoMillis().withZone(DateTimeZone.UTC);
                // solr.getSolrDocument().setField(SolrFields.LAST_MODIFIED,
                // edate);
                solr.setField(SolrFields.LAST_MODIFIED, iso_df.print(edate));
            }
        }

        // Also look to record the software identifiers:

        // Look for generic xmp:CreatorTool
        solr.addField(SolrFields.GENERATOR, metadata.get("xmp:CreatorTool"));
        // For PDF, support other metadata tags:
        //solr.addField(SolrFields.GENERATOR, metadata.get( "creator" )); // This appears to be dc:creator i.e. author.
        solr.addField(SolrFields.GENERATOR, metadata.get("producer"));
        solr.addField(SolrFields.GENERATOR, metadata.get(Metadata.SOFTWARE));
        solr.addField(SolrFields.GENERATOR, metadata.get("generator"));
        solr.addField(SolrFields.GENERATOR, metadata.get("Software"));

        // Application ID, MS Office only AFAICT, and the VERSION is only doc
        String software = null;
        if (metadata.get(Metadata.APPLICATION_NAME) != null)
            software = metadata.get(Metadata.APPLICATION_NAME);
        if (metadata.get(Metadata.APPLICATION_VERSION) != null)
            software += " " + metadata.get(Metadata.APPLICATION_VERSION);
        // Images, e.g. JPEG and TIFF, can have 'Software', 'tiff:Software',
        // PNGs have a 'tEXt tEXtEntry: keyword=Software, value=GPL Ghostscript 8.71'
        String png_textentry = metadata.get("tEXt tEXtEntry");
        if (png_textentry != null && png_textentry.contains("keyword=Software, value="))
            software = png_textentry.replace("keyword=Software, value=", "");
        /* Some JPEGs have this:
        Jpeg Comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality
        comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality
         */
        if (software != null) {
            solr.addField(SolrFields.GENERATOR, software);
        }
        Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#extract",
                extractStart);

    } catch (Exception e) {
        log.error("TikaExtractor.extract(): " + e.getMessage());
    }

    // TODO: This should probably be wrapped in a method-spanning try-finally to guarantee close
    if (tikainput != null) {
        try {
            tikainput.close();
        } catch (IOException e) {
            log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage());
        }
    }

    return solr;
}