List of usage examples for org.apache.commons.lang StringUtils left
public static String left(String str, int len)
Gets the leftmost len
characters of a String.
From source file:pl.otros.logview.gui.message.update.MessageUpdateUtils.java
public String formatMessageWithTimeLimit(final String s1, final MessageFormatter messageFormatter, int timeoutSeconds) { String result = s1;//from w w w .j av a 2s.com Callable<String> callable = new Callable<String>() { @Override public String call() throws Exception { ClassLoader contextClassLoader = Thread.currentThread().getContextClassLoader(); try { Thread.currentThread().setContextClassLoader(messageFormatter.getClass().getClassLoader()); if (messageFormatter.formattingNeeded(s1)) { return messageFormatter.format(s1); } } catch (Throwable e) { LOGGER.severe(String.format("Error occurred when using message formatter %s: %s", messageFormatter.getName(), e.getMessage())); LOGGER.fine(String.format("Error occurred when using message formatter %s with message\"%s\"", messageFormatter.getName(), StringUtils.left(s1, 1500))); } finally { Thread.currentThread().setContextClassLoader(contextClassLoader); } return s1; } }; Future<String> submit = executorService.submit(callable); try { result = submit.get(timeoutSeconds, TimeUnit.SECONDS); } catch (TimeoutException e) { String msg = String.format("Formatting message with %s takes to long time, skipping this formatter", messageFormatter.getName()); LOGGER.warning(msg); submit.cancel(true); } catch (Exception e) { LOGGER.severe(String.format("Error occurred when using message formatter %s: %s", messageFormatter.getName(), e.getMessage())); submit.cancel(true); } return result; }
From source file:pl.otros.logview.gui.message.update.MessageUpdateUtils.java
public Collection<MessageFragmentStyle> colorizeMessageWithTimeLimit(final String message, final int messageStartOffset, final MessageColorizer messageColorizer, int timeoutSeconds) { Callable<Collection<MessageFragmentStyle>> callable = new Callable<Collection<MessageFragmentStyle>>() { @Override//from www .ja v a 2 s . co m public Collection<MessageFragmentStyle> call() throws Exception { Collection<MessageFragmentStyle> list = new ArrayList<MessageFragmentStyle>(); ClassLoader contextClassLoader = Thread.currentThread().getContextClassLoader(); try { Thread.currentThread().setContextClassLoader(messageColorizer.getClass().getClassLoader()); if (messageColorizer.colorizingNeeded(message)) { Collection<MessageFragmentStyle> colorize = messageColorizer.colorize(message); for (MessageFragmentStyle messageFragmentStyle : colorize) { messageFragmentStyle.setOffset(messageFragmentStyle.getOffset() + messageStartOffset); } list.addAll(colorize); } } catch (Throwable e) { LOGGER.log(Level.SEVERE, String.format("Error occurred when using message colorizer %s: %s%n%s", messageColorizer.getName(), e.getMessage()), e); LOGGER.fine(String.format("Error occurred when using message colorizer %s with message\"%s\"", messageColorizer.getName(), StringUtils.left(message, 1500))); e.printStackTrace(); } finally { Thread.currentThread().setContextClassLoader(contextClassLoader); } return list; } }; Future<Collection<MessageFragmentStyle>> submit = executorService.submit(callable); try { return submit.get(timeoutSeconds, TimeUnit.SECONDS); } catch (TimeoutException e) { String msg = String.format("Formatting message with %s takes to long time, skipping this formatter", messageColorizer.getName()); LOGGER.warning(msg); submit.cancel(true); } catch (Exception e) { LOGGER.severe(String.format("Error occurred when using message formatter %s: %s", messageColorizer.getName(), e.getMessage())); submit.cancel(true); } return new ArrayList<MessageFragmentStyle>(0); }
From source file:pl.otros.logview.gui.MessageDetailListener.java
/**
 * Rebuilds the detail pane from the currently selected table row.
 *
 * <p>With a valid selection the pane shows date, class, method, level (with icon when one is
 * available), the message (truncated to {@code maximumMessageSize} characters, then passed
 * through every formatter and colorizer), the properties and the note. Without a valid selection
 * it shows "No event selected". In both cases the caret is moved to the start afterwards.
 */
public void updateInfo() {
    final Collection<MessageFormatter> formatters = formattersContainer.getElements();
    final Collection<MessageColorizer> colorizers = colorizersContainer.getElements();
    final int selectedRow = table.getSelectedRow();

    if (selectedRow < 0 || selectedRow >= table.getRowCount()) {
        // Nothing (valid) selected: show a placeholder only.
        final StyledDocument placeholderDoc = logDetailTextArea.getStyledDocument();
        synchronized (placeholderDoc) {
            try {
                placeholderDoc.remove(0, placeholderDoc.getLength());
                placeholderDoc.insertString(0, "No event selected", mainStyle);
            } catch (BadLocationException e) {
                LOGGER.warning("Cant set message details: " + e.getMessage());
            }
        }
    } else {
        logDetailTextArea.setText("");
        final int modelRow = table.convertRowIndexToModel(selectedRow);
        final LogData logData = dataTableModel.getLogData(modelRow);
        final StyledDocument doc = logDetailTextArea.getStyledDocument();
        synchronized (doc) {
            try {
                doc.remove(0, doc.getLength());

                // Header lines.
                final String dateLine = "Date: " + dateFormat.format(logData.getDate()) + "\n";
                doc.insertString(0, dateLine, mainStyle);
                final String classLine = "Class: " + logData.getClazz() + "\n";
                doc.insertString(doc.getLength(), classLine, classMethodStyle);
                final String methodLine = "Method: " + logData.getMethod() + "\n";
                doc.insertString(doc.getLength(), methodLine, classMethodStyle);
                final String levelLabel = "Level: ";
                doc.insertString(doc.getLength(), levelLabel, classMethodStyle);
                final Icon icon = LevelRenderer.getIconByLevel(logData.getLevel());
                if (icon != null) {
                    logDetailTextArea.insertIcon(icon);
                }
                final String levelName = " " + logData.getLevel().getName() + "\n";
                doc.insertString(doc.getLength(), levelName, classMethodStyle);
                final String messageLabel = "Message: ";
                doc.insertString(doc.getLength(), messageLabel, mainStyle);

                // Everything from this offset on is message text; colorizers work on that range.
                final int messageStart = doc.getLength();
                String messageText = logData.getMessage();
                if (messageText.length() > maximumMessageSize) {
                    final int cutOff = messageText.length() - maximumMessageSize;
                    messageText = StringUtils.left(messageText, maximumMessageSize)
                            + String.format("%n...%n...(+%,d chars)", cutOff);
                }

                // Each formatter runs with its own class loader as the thread context class loader.
                for (MessageFormatter formatter : formatters) {
                    final ClassLoader previousLoader = Thread.currentThread().getContextClassLoader();
                    try {
                        Thread.currentThread().setContextClassLoader(formatter.getClass().getClassLoader());
                        if (formatter.formattingNeeded(messageText)) {
                            messageText = formatter.format(messageText);
                        }
                    } catch (Throwable e) {
                        LOGGER.severe(String.format("Error occured when using message formatter %s: %s",
                                formatter.getName(), e.getMessage()));
                        LOGGER.fine(String.format(
                                "Error occured when using message formatter %s with message\"%s\"",
                                formatter.getName(), messageText));
                    } finally {
                        Thread.currentThread().setContextClassLoader(previousLoader);
                    }
                }
                doc.insertString(doc.getLength(), messageText, mainStyle);

                // Colorize the message range; remember the search-result colorizer for a final pass.
                searchResultMessageColorizer = null;
                for (MessageColorizer colorizer : colorizers) {
                    final ClassLoader previousLoader = Thread.currentThread().getContextClassLoader();
                    try {
                        Thread.currentThread().setContextClassLoader(colorizer.getClass().getClassLoader());
                        if (colorizer.colorizingNeeded(messageText)) {
                            colorizer.colorize(doc, messageStart, doc.getLength() - messageStart);
                        }
                    } catch (Throwable e) {
                        LOGGER.severe(String.format("Error occured when using message colorizer %s: %s",
                                colorizer.getName(), e.getMessage()));
                        LOGGER.fine(String.format(
                                "Error occured when using message colorizer %s with message\"%s\"",
                                colorizer.getName(), messageText));
                    } finally {
                        Thread.currentThread().setContextClassLoader(previousLoader);
                    }
                    if (colorizer.getPluginableId().equals(SearchResultColorizer.class.getName())) {
                        searchResultMessageColorizer = colorizer;
                    }
                }
                if (searchResultMessageColorizer != null
                        && searchResultMessageColorizer.colorizingNeeded(messageText)) {
                    searchResultMessageColorizer.colorize(doc, messageStart, doc.getLength() - messageStart);
                }
                doc.insertString(doc.getLength(), "\n", mainStyle);

                // Optional trailing sections.
                if (logData.getProperties() != null && logData.getProperties().size() > 0) {
                    doc.insertString(doc.getLength(), "\nProperties:\n", noteStyle);
                    final String joinedProperties = Joiner.on("\n").withKeyValueSeparator("=")
                            .join(logData.getProperties());
                    doc.insertString(doc.getLength(), joinedProperties, noteStyle);
                    doc.insertString(doc.getLength(), "\n", noteStyle);
                }
                final Note rowNote = dataTableModel.getNote(modelRow);
                if (rowNote != null && rowNote.getNote() != null && rowNote.getNote().length() > 0) {
                    final String noteText = "\nNote: " + rowNote.getNote();
                    doc.insertString(doc.getLength(), noteText, noteStyle);
                }
            } catch (BadLocationException e) {
                LOGGER.warning("Cant set message details: " + e.getMessage());
            }
        }
    }
    logDetailTextArea.setCaretPosition(0);
}
From source file:uk.bl.wa.analyser.payload.TikaPayloadAnalyser.java
/** * @param source the source of the input stream - typically a WARC file. Used for error logging. * @param solr //from w w w . j av a 2 s . c om * @param is content to analyse. * @param url optional URL for the bytes in is. * @return * @throws IOException */ @SuppressWarnings("deprecation") public SolrRecord extract(String source, SolrRecord solr, InputStream is, String url) throws IOException { // Set up the TikaInputStream: TikaInputStream tikainput = null; if (this.maxBytesToParser > 0) { tikainput = TikaInputStream .get(new BoundedInputStream(new CloseShieldInputStream(is), maxBytesToParser)); } else { tikainput = TikaInputStream.get(new CloseShieldInputStream(is)); } // Also pass URL as metadata to allow extension hints to work: Metadata metadata = new Metadata(); if (url != null) metadata.set(Metadata.RESOURCE_NAME_KEY, url); final long detectStart = System.nanoTime(); StringBuilder detected = new StringBuilder(); try { DetectRunner detect = new DetectRunner(source, tika, tikainput, detected, metadata); TimeLimiter.run(detect, 10000L, false); } catch (NoSuchFieldError e) { // TODO Is this an Apache POI version issue? 
log.error("Tika.detect(): " + e.getMessage() + " for " + url + " in " + source); addExceptionMetadata(metadata, new Exception("detect threw " + e.getClass().getCanonicalName())); } catch (Exception e) { log.error("Tika.detect(): " + e.getMessage() + " for " + url + " in " + source); addExceptionMetadata(metadata, e); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#detect", detectStart); // Only proceed if we have a suitable type: if (!this.checkMime(detected.toString())) { if ("".equals(detected.toString())) { solr.addField(SolrFields.SOLR_CONTENT_TYPE, MediaType.APPLICATION_OCTET_STREAM.toString()); } else { solr.addField(SolrFields.SOLR_CONTENT_TYPE, detected.toString()); } return solr; } // Context ParseContext context = new ParseContext(); StringWriter content = new StringWriter(); // Override the recursive parsing: if (embedded == null) embedded = new NonRecursiveEmbeddedDocumentExtractor(context); context.set(EmbeddedDocumentExtractor.class, embedded); try { final long parseStart = System.nanoTime(); ParseRunner runner = new ParseRunner(source, tika.getParser(), tikainput, this.getHandler(content), metadata, context); try { TimeLimiter.run(runner, parseTimeout, true); } catch (OutOfMemoryError o) { log.error("TikaExtractor.parse() - OutOfMemoryError: " + o.getMessage() + " for " + url + " in " + source); addExceptionMetadata(metadata, new Exception("OutOfMemoryError")); } catch (RuntimeException r) { log.error("TikaExtractor.parse() - RuntimeException: " + r.getMessage() + " for " + url + " in " + source); addExceptionMetadata(metadata, r); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#parse", parseStart); // If there was a parse error, report it: String tikaException = metadata.get(TikaPayloadAnalyser.TIKA_PARSE_EXCEPTION); if (tikaException != null) { solr.addParseException(tikaException, new RuntimeException("Exception from Tika")); } final long extractStart = 
System.nanoTime(); // Copy the body text, forcing a UTF-8 encoding: String output = new String(content.toString().getBytes("UTF-8")); if (runner.complete || !output.equals("")) { if (output.length() > this.max_text_length) { output = output.substring(0, this.max_text_length); } log.debug("Extracted text from: " + url + " in " + source); log.debug("Extracted text: " + StringUtils.left(output, 300)); solr.setField(SolrFields.SOLR_EXTRACTED_TEXT, output); solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_LENGTH, Integer.toString(output.length())); } else { //log.debug("Failed to extract any text from: "+url); } // Noisily report all metadata properties: /* * for( String m : metadata.names() ) { * log.info("For "+url.substring(url.length() - (int) * Math.pow(url.length(),0.85))+": "+m+" -> "+metadata.get(m)); } */ // Attempt to record all metadata discovered: if (this.extractAllMetadata) { for (String m : metadata.names()) { // Ignore these as they are not very interesting: if (Metadata.RESOURCE_NAME_KEY.equalsIgnoreCase(m) || "dc:title".equalsIgnoreCase(m) || "title".equalsIgnoreCase(m) || "description".equalsIgnoreCase(m) || "keywords".equalsIgnoreCase(m) || Metadata.CONTENT_ENCODING.equalsIgnoreCase(m) || Metadata.CONTENT_LOCATION.equalsIgnoreCase(m) || "ACTINICTITLE".equalsIgnoreCase(m) || Metadata.CONTENT_TYPE.equalsIgnoreCase(m)) { continue; } // Record in the document, but trim big ones: String value = metadata.get(m); if (value != null && value.length() > 100) { value = value.substring(0, 100); } solr.addField(SolrFields.SOLR_TIKA_METADATA, m + "=" + value); } } // Also Pick out particular metadata: String contentType = metadata.get(Metadata.CONTENT_TYPE); solr.addField(SolrFields.SOLR_CONTENT_TYPE, contentType); solr.addField(SolrFields.SOLR_TITLE, metadata.get(DublinCore.TITLE)); solr.addField(SolrFields.SOLR_DESCRIPTION, metadata.get(DublinCore.DESCRIPTION)); solr.addField(SolrFields.SOLR_KEYWORDS, metadata.get("keywords")); solr.addField(SolrFields.SOLR_AUTHOR, 
metadata.get(DublinCore.CREATOR)); solr.addField(SolrFields.CONTENT_ENCODING, metadata.get(Metadata.CONTENT_ENCODING)); // Parse out any embedded date that can act as a created/modified date. // I was not able find a single example where both created and modified where defined and different. I single field is sufficient. String date = null; if (metadata.get(Metadata.CREATION_DATE) != null) date = metadata.get(Metadata.CREATION_DATE); if (metadata.get(Metadata.DATE) != null) date = metadata.get(Metadata.DATE); if (metadata.get(Metadata.MODIFIED) != null) date = metadata.get(Metadata.MODIFIED); if (date != null) { DateTimeFormatter df = ISODateTimeFormat.dateTimeParser(); DateTime edate = null; try { edate = df.parseDateTime(date); } catch (IllegalArgumentException e) { log.error("Could not parse date: " + date + " from URL " + url + " in " + source); } if (edate == null) { Date javadate = Times.extractDate(date); if (javadate != null) edate = new org.joda.time.DateTime(javadate); } if (edate != null) { solr.addField(SolrFields.LAST_MODIFIED_YEAR, "" + edate.getYear()); DateTimeFormatter iso_df = ISODateTimeFormat.dateTimeNoMillis().withZone(DateTimeZone.UTC); // solr.getSolrDocument().setField(SolrFields.LAST_MODIFIED, // edate); solr.setField(SolrFields.LAST_MODIFIED, iso_df.print(edate)); } } // Also look to record the software identifiers: // Look for generic xmp:CreatorTool solr.addField(SolrFields.GENERATOR, metadata.get("xmp:CreatorTool")); // For PDF, support other metadata tags: //solr.addField(SolrFields.GENERATOR, metadata.get( "creator" )); // This appears to be dc:creator i.e. author. 
solr.addField(SolrFields.GENERATOR, metadata.get("producer")); solr.addField(SolrFields.GENERATOR, metadata.get(Metadata.SOFTWARE)); solr.addField(SolrFields.GENERATOR, metadata.get("software")); solr.addField(SolrFields.GENERATOR, metadata.get("Software")); solr.addField(SolrFields.GENERATOR, metadata.get("generator")); solr.addField(SolrFields.GENERATOR, metadata.get("Generator")); solr.addField(SolrFields.GENERATOR, metadata.get("ProgId")); //handle image EXIF metaformat String exifVersion = metadata.get("Exif Version"); if (exifVersion != null) { solr.addField(SolrFields.EXIF_VERSION, exifVersion); String exif_artist = metadata.get("Artist"); if (exif_artist != null) { // This is a better value for the author field // This potentially results in multiple author, which is valid solr.addField(SolrFields.SOLR_AUTHOR, exif_artist); } if (this.extractExifLocation) { String exif_latitude = metadata.get("GPS Latitude"); String exif_longitude = metadata.get("GPS Longitude"); if (exif_latitude != null && exif_longitude != null) { double latitude = DMS2DG(exif_latitude); double longitude = DMS2DG(exif_longitude); try { if (latitude != 0d && longitude != 0d) { // Sometimes they are defined but both 0 if (latitude <= 90 && latitude >= -90 && longitude <= 180 && longitude >= -180) { solr.addField(SolrFields.EXIF_LOCATION, latitude + "," + longitude); } else { log.warn( "invalid gsp exif information:" + exif_latitude + "," + exif_longitude); } } } catch (Exception e) { //Just ignore. No GPS data added to solr log.warn("error parsing exif gps data. latitude:" + exif_latitude + " longitude:" + exif_longitude); } } } } //End image exif metadata // Application ID, MS Office only AFAICT, and the VERSION is only doc String software = null; if (metadata.get(Metadata.APPLICATION_NAME) != null) software = metadata.get(Metadata.APPLICATION_NAME); if (metadata.get(Metadata.APPLICATION_VERSION) != null) software += " " + metadata.get(Metadata.APPLICATION_VERSION); // Images, e.g. 
JPEG and TIFF, can have 'Software', 'tiff:Software', // PNGs have a 'tEXt tEXtEntry: keyword=Software, value=GPL Ghostscript 8.71' String png_textentry = metadata.get("tEXt tEXtEntry"); if (png_textentry != null && png_textentry.contains("keyword=Software, value=")) software = png_textentry.replace("keyword=Software, value=", ""); /* Some JPEGs have this: Jpeg Comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality */ if (software != null) { solr.addField(SolrFields.GENERATOR, software); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#extract", extractStart); } catch (Exception e) { log.error("TikaExtractor.extract(): " + e.getMessage() + " for URL " + url + " in " + source); } // TODO: This should probably be wrapped in a method-spanning try-finally to guarantee close if (tikainput != null) { try { tikainput.close(); } catch (IOException e) { log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage() + " for " + url + " in " + source); } } return solr; }
From source file:uk.bl.wa.parsers.HtmlFeatureParser.java
/** * /*w ww. j av a 2 s .c o m*/ */ @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { final long start = System.nanoTime(); // Pick up the URL: String url = metadata.get(Metadata.RESOURCE_NAME_KEY); // Parse it using JSoup Document doc = null; try { doc = Jsoup.parse(stream, null, url, parser); } catch (java.nio.charset.IllegalCharsetNameException e) { log.warn("Jsoup parse had to assume UTF-8: " + e); doc = Jsoup.parse(stream, "UTF-8", url); } catch (Exception e) { log.error("Jsoup parse failed: " + e); } finally { if (doc == null) return; } Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#jsoupparse", start); final long nonJsoupStart = System.nanoTime(); // Record the number of errors found: if (parser.getErrors() != null) metadata.set(NUM_PARSE_ERRORS, parser.getErrors().size()); // Get the links (no image links): Set<String> links = this.extractLinks(doc); if (links != null && links.size() > 0) { metadata.set(LINK_LIST, links.toArray(new String[links.size()])); } //get the image links if (extractImageLinks) { Set<String> imageLinks = this.extractImageLinks(doc); if (imageLinks != null && imageLinks.size() > 0) { metadata.set(IMAGE_LINKS, imageLinks.toArray(new String[imageLinks.size()])); } } // Get the publication date, from BBC pages: for (Element meta : doc.select("meta[name=OriginalPublicationDate]")) { metadata.set(ORIGINAL_PUB_DATE, meta.attr("content")); //log.debug(ORIGINAL_PUB_DATE + ": " + meta.attr("content")); } // Grab the first paragraph with text, and extract the text: for (Element p : doc.select("p")) { String pt = p.text(); if (pt != null) { pt = pt.trim(); if (pt.length() > 0) { metadata.set(FIRST_PARAGRAPH, p.text()); //log.debug(FIRST_PARAGRAPH + ": " +p.text() ); break; } } } // Grab the list of distinct elements used in the page: Set<String> de = new HashSet<String>(); for (Element e : doc.select("*")) 
{ // ELEMENT_NAME matching to weed out the worst false positives caused by JavaScript // This handles cases such as '<script> if (3<a) console.log('something');' where 'a' would have been // seen as a tag, but does not handle cases such as '<script> if ( 3<a ) console.log('something');' where // the a is still seen as a tag because it is followed by a space if (!"#root".equals(e.tag().getName()) && ELEMENT_NAME.matcher(e.tag().getName()).matches()) { de.add(StringUtils.left(e.tag().getName().toLowerCase(Locale.ENGLISH), 100)); } } // For some elements, dig deeper and record attributes too: for (Element e : doc.select("link")) { de.add("link/@rel=" + e.attr("rel")); } // Store them: metadata.set(DISTINCT_ELEMENTS, de.toArray(new String[] {})); // Licence field, following: // http://www.whatwg.org/specs/web-apps/current-work/multipage/links.html#link-type-license for (Element a : doc.select("a[rel=license]")) { metadata.add(Metadata.LICENSE_URL, a.attr("href")); } for (Element a : doc.select("link[rel=license]")) { metadata.add(Metadata.LICENSE_URL, a.attr("href")); } for (Element a : doc.select("area[rel=license]")) { metadata.add(Metadata.LICENSE_URL, a.attr("href")); } Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#featureextract", nonJsoupStart); }
From source file:uk.bl.wa.solr.TikaExtractor.java
/** * // w w w . j a va 2 s . c o m * @param solr * @param is * @param url * @return * @throws IOException */ @SuppressWarnings("deprecation") public SolrRecord extract(SolrRecord solr, InputStream is, String url) throws IOException { // Set up the TikaInputStream: TikaInputStream tikainput = null; if (this.maxBytesToParser > 0) { tikainput = TikaInputStream .get(new BoundedInputStream(new CloseShieldInputStream(is), maxBytesToParser)); } else { tikainput = TikaInputStream.get(new CloseShieldInputStream(is)); } // Also pass URL as metadata to allow extension hints to work: Metadata metadata = new Metadata(); if (url != null) metadata.set(Metadata.RESOURCE_NAME_KEY, url); final long detectStart = System.nanoTime(); StringBuilder detected = new StringBuilder(); try { DetectRunner detect = new DetectRunner(tika, tikainput, detected, metadata); Thread detectThread = new Thread(detect, Long.toString(System.currentTimeMillis())); detectThread.start(); detectThread.join(10000L); detectThread.interrupt(); } catch (NoSuchFieldError e) { // TODO Is this an Apache POI version issue? 
log.error("Tika.detect(): " + e.getMessage()); addExceptionMetadata(metadata, new Exception("detect threw " + e.getClass().getCanonicalName())); } catch (Exception e) { log.error("Tika.detect(): " + e.getMessage()); addExceptionMetadata(metadata, e); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#detect", detectStart); // Only proceed if we have a suitable type: if (!this.checkMime(detected.toString())) { if ("".equals(detected.toString())) { solr.addField(SolrFields.SOLR_CONTENT_TYPE, MediaType.APPLICATION_OCTET_STREAM.toString()); } else { solr.addField(SolrFields.SOLR_CONTENT_TYPE, detected.toString()); } return solr; } // Context ParseContext context = new ParseContext(); StringWriter content = new StringWriter(); // Override the recursive parsing: if (embedded == null) embedded = new NonRecursiveEmbeddedDocumentExtractor(context); context.set(EmbeddedDocumentExtractor.class, embedded); try { final long parseStart = System.nanoTime(); ParseRunner runner = new ParseRunner(tika.getParser(), tikainput, this.getHandler(content), metadata, context); Thread parseThread = new Thread(runner, Long.toString(System.currentTimeMillis())); try { parseThread.start(); parseThread.join(this.parseTimeout); parseThread.interrupt(); parseThread.join(this.parseTimeout); } catch (OutOfMemoryError o) { log.error("TikaExtractor.parse() - OutOfMemoryError: " + o.getMessage()); addExceptionMetadata(metadata, new Exception("OutOfMemoryError")); } catch (RuntimeException r) { log.error("TikaExtractor.parse() - RuntimeException: " + r.getMessage()); addExceptionMetadata(metadata, r); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#parse", parseStart); // If there was a parse error, report it: solr.addField(SolrFields.PARSE_ERROR, metadata.get(TikaExtractor.TIKA_PARSE_EXCEPTION)); final long extractStart = System.nanoTime(); // Copy the body text, forcing a UTF-8 encoding: String output = new 
String(content.toString().getBytes("UTF-8")); if (runner.complete || !output.equals("")) { if (output.length() > this.max_text_length) { output = output.substring(0, this.max_text_length); } log.debug("Extracted text from: " + url); log.debug("Extracted text: " + StringUtils.left(output, 300)); solr.setField(SolrFields.SOLR_EXTRACTED_TEXT, output); solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_LENGTH, Integer.toString(output.length())); } else { //log.debug("Failed to extract any text from: "+url); } // Noisily report all metadata properties: /* * for( String m : metadata.names() ) { * log.info("For "+url.substring(url.length() - (int) * Math.pow(url.length(),0.85))+": "+m+" -> "+metadata.get(m)); } */ // Attempt to record all metadata discovered: if (this.extractAllMetadata) { for (String m : metadata.names()) { // Ignore these as they are not very interesting: if (Metadata.RESOURCE_NAME_KEY.equalsIgnoreCase(m) || "dc:title".equalsIgnoreCase(m) || "title".equalsIgnoreCase(m) || "description".equalsIgnoreCase(m) || "keywords".equalsIgnoreCase(m) || Metadata.CONTENT_ENCODING.equalsIgnoreCase(m) || Metadata.CONTENT_LOCATION.equalsIgnoreCase(m) || "ACTINICTITLE".equalsIgnoreCase(m) || Metadata.CONTENT_TYPE.equalsIgnoreCase(m)) { continue; } // Record in the document, but trim big ones: String value = metadata.get(m); if (value != null && value.length() > 100) { value = value.substring(0, 100); } solr.addField(SolrFields.SOLR_TIKA_METADATA, m + "=" + value); } } // Also Pick out particular metadata: String contentType = metadata.get(Metadata.CONTENT_TYPE); solr.addField(SolrFields.SOLR_CONTENT_TYPE, contentType); solr.addField(SolrFields.SOLR_TITLE, metadata.get(DublinCore.TITLE)); solr.addField(SolrFields.SOLR_DESCRIPTION, metadata.get(DublinCore.DESCRIPTION)); solr.addField(SolrFields.SOLR_KEYWORDS, metadata.get("keywords")); solr.addField(SolrFields.SOLR_AUTHOR, metadata.get(DublinCore.CREATOR)); solr.addField(SolrFields.CONTENT_ENCODING, 
metadata.get(Metadata.CONTENT_ENCODING)); // Parse out any embedded date that can act as a created/modified date. String date = null; if (metadata.get(Metadata.CREATION_DATE) != null) date = metadata.get(Metadata.CREATION_DATE); if (metadata.get(Metadata.DATE) != null) date = metadata.get(Metadata.DATE); if (metadata.get(Metadata.MODIFIED) != null) date = metadata.get(Metadata.MODIFIED); if (date != null) { DateTimeFormatter df = ISODateTimeFormat.dateTimeParser(); DateTime edate = null; try { edate = df.parseDateTime(date); } catch (IllegalArgumentException e) { log.error("Could not parse: " + date); } if (edate == null) { Date javadate = Times.extractDate(date); if (javadate != null) edate = new org.joda.time.DateTime(javadate); } if (edate != null) { solr.addField(SolrFields.LAST_MODIFIED_YEAR, "" + edate.getYear()); DateTimeFormatter iso_df = ISODateTimeFormat.dateTimeNoMillis().withZone(DateTimeZone.UTC); // solr.getSolrDocument().setField(SolrFields.LAST_MODIFIED, // edate); solr.setField(SolrFields.LAST_MODIFIED, iso_df.print(edate)); } } // Also look to record the software identifiers: // Look for generic xmp:CreatorTool solr.addField(SolrFields.GENERATOR, metadata.get("xmp:CreatorTool")); // For PDF, support other metadata tags: //solr.addField(SolrFields.GENERATOR, metadata.get( "creator" )); // This appears to be dc:creator i.e. author. solr.addField(SolrFields.GENERATOR, metadata.get("producer")); solr.addField(SolrFields.GENERATOR, metadata.get(Metadata.SOFTWARE)); solr.addField(SolrFields.GENERATOR, metadata.get("generator")); solr.addField(SolrFields.GENERATOR, metadata.get("Software")); // Application ID, MS Office only AFAICT, and the VERSION is only doc String software = null; if (metadata.get(Metadata.APPLICATION_NAME) != null) software = metadata.get(Metadata.APPLICATION_NAME); if (metadata.get(Metadata.APPLICATION_VERSION) != null) software += " " + metadata.get(Metadata.APPLICATION_VERSION); // Images, e.g. 
JPEG and TIFF, can have 'Software', 'tiff:Software', // PNGs have a 'tEXt tEXtEntry: keyword=Software, value=GPL Ghostscript 8.71' String png_textentry = metadata.get("tEXt tEXtEntry"); if (png_textentry != null && png_textentry.contains("keyword=Software, value=")) software = png_textentry.replace("keyword=Software, value=", ""); /* Some JPEGs have this: Jpeg Comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality */ if (software != null) { solr.addField(SolrFields.GENERATOR, software); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#tikasolrextract", "TikaExtractor.extract#extract", extractStart); } catch (Exception e) { log.error("TikaExtractor.extract(): " + e.getMessage()); } // TODO: This should probably be wrapped in a method-spanning try-finally to guarantee close if (tikainput != null) { try { tikainput.close(); } catch (IOException e) { log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage()); } } return solr; }