List of usage examples for the org.xml.sax.helpers.DefaultHandler constructor DefaultHandler()
DefaultHandler
From source file:com.shin1ogawa.appengine.marketplace.gdata.LicensingAPI.java
static List<Map<String, String>> parseLicenseFeed(InputStream is) throws SAXException, IOException, ParserConfigurationException { SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); final List<Map<String, String>> list = Lists.newArrayList(); parser.parse(is, new DefaultHandler() { boolean entity = false; String currentElement;//from w w w .j av a 2 s.co m Map<String, String> map; @Override public void characters(char[] ch, int start, int length) { if (entity) { map.put(currentElement, new String(ch, start, length)); } } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) { if (entity == false && StringUtils.equals(qName, "entity")) { entity = true; map = Maps.newHashMap(); list.add(map); return; } currentElement = qName; } @Override public void endElement(String uri, String localName, String qName) { if (entity && StringUtils.equals(qName, "entity")) { entity = false; } } }); return list; }
From source file:com.zegoggles.smssync.XOAuthConsumer.java
protected String getUsernameFromContacts() { final HttpClient httpClient = new DefaultHttpClient(); final String url = "https://www.google.com/m8/feeds/contacts/default/thin?max-results=1"; final StringBuilder email = new StringBuilder(); try {//from w w w . j a va2s .c om HttpGet get = new HttpGet(sign(url)); HttpResponse resp = httpClient.execute(get); SAXParserFactory spf = SAXParserFactory.newInstance(); SAXParser sp = spf.newSAXParser(); XMLReader xr = sp.getXMLReader(); xr.setContentHandler(new DefaultHandler() { boolean inEmail; @Override public void startElement(String uri, String localName, String qName, Attributes atts) { inEmail = "email".equals(localName); } @Override public void characters(char[] c, int start, int length) { if (inEmail) { email.append(c, start, length); } } }); xr.parse(new InputSource(resp.getEntity().getContent())); return email.toString(); } catch (oauth.signpost.exception.OAuthException e) { Log.e(TAG, "error", e); return null; } catch (org.xml.sax.SAXException e) { Log.e(TAG, "error", e); return null; } catch (java.io.IOException e) { Log.e(TAG, "error", e); return null; } catch (javax.xml.parsers.ParserConfigurationException e) { Log.e(TAG, "error", e); return null; } }
From source file:com.nidhinova.tika.server.TikaService.java
/** * Serves HTTP GET Returns metadata formatted as json or plain text content * of the file. File should be locally accessible for Tika Server using * pathkey JNDI/*w w w . j a v a2s . co m*/ * * @param filename * @param pathkey * (JNDI lookup key) * @param opkey * (can be "text" or "metadata" or "fulldata") * @param httpHeaders * @return * @throws Exception */ @GET @Produces({ MediaType.APPLICATION_JSON }) @Path("/{opkey}/{pathkey}/{resourceid: .*}") public StreamingOutput getMetadata(@javax.ws.rs.core.Context javax.ws.rs.core.UriInfo uriInfo, @PathParam("opkey") final String opkey, @PathParam("pathkey") final String pathkey, @PathParam("resourceid") final String resourceId, @Context HttpHeaders httpHeaders) throws Exception { // get the resource segment, this may have query params // we are ok with it as long as we can get something at that location String[] segments = uriInfo.getRequestUri().toASCIIString().split("/" + opkey + "/" + pathkey + "/"); final String filename = segments[segments.length - 1]; logger.info("resource :" + segments[segments.length - 1]); final Detector detector = createDetector(httpHeaders); final AutoDetectParser parser = new AutoDetectParser(detector); final ParseContext context = new ParseContext(); context.set(Parser.class, parser); final org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata(); setMetadataFromHeader(parser, metadata, httpHeaders); URL url = null; try { if (pathkey != null && resourceId != null) { String filepath = getFilePath(pathkey) + filename; File file = new File(filepath); if (file.isFile()) { url = file.toURI().toURL(); } else { url = new URL(filepath); } } } catch (MalformedURLException mex) { throw new WebApplicationException(Response.Status.NOT_FOUND); } final InputStream is = TikaInputStream.get(url, metadata); return new StreamingOutput() { public void write(OutputStream outputStream) throws IOException, WebApplicationException { StringWriter textBuffer = new StringWriter(); 
ContentHandler handler = null; if (opkey.equalsIgnoreCase("metadata")) { handler = new DefaultHandler(); } else if (opkey.equalsIgnoreCase("text") || opkey.equalsIgnoreCase("fulldata")) { handler = new BodyContentHandler(textBuffer); } try { parser.parse(is, handler, metadata, context); String contentEncoding = (metadata .get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING) == null ? "UTF-8" : metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING)); logger.info("Content encoding: " + metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING)); Writer outWriter = getOutputWriter(outputStream, contentEncoding); //metadata is always gathered // munch tika metadata object it to make json String jsonMetadata = JSONHelper.metadataToJson(metadata); if (opkey.equalsIgnoreCase("metadata")) { outWriter.write("{\"metadata\":" + jsonMetadata + "}"); } else if (opkey.equalsIgnoreCase("text")) { // write it out outWriter.write("{ \"text\":" + JSONHelper.toJSON(textBuffer.toString()) + " }"); } else if (opkey.equalsIgnoreCase("fulldata")) { StringBuilder data = new StringBuilder(); data.append("{ \"metadata\":" + jsonMetadata).append(", ") .append("\"text\":" + JSONHelper.toJSON(textBuffer.toString()) + " }"); outWriter.write(data.toString()); } outWriter.flush(); } catch (SAXException e) { throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR); } catch (TikaException e) { if (e.getCause() != null && e.getCause() instanceof WebApplicationException) { throw (WebApplicationException) e.getCause(); } if (e.getCause() != null && e.getCause() instanceof IllegalStateException) { throw new WebApplicationException(Response.status(422).build()); } if (e.getCause() != null && e.getCause() instanceof EncryptedDocumentException) { throw new WebApplicationException(Response.status(422).build()); } if (e.getCause() != null && e.getCause() instanceof OldWordFileFormatException) { throw new WebApplicationException(Response.status(422).build()); } 
logger.warn("Text extraction failed", e); throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR); } } }; }
From source file:TransformThread.java
/** Initialize the results (m_outResult) according * to RESULT_FLAVOR/*from ww w. j ava2 s.c o m*/ */ private void initResult() { try { for (int i = 0; i < NUM_TRANSFORMATIONS; i++) { switch (RESULT_FLAVOR) { case STREAM: OutputStream outStream = new FileOutputStream( FILE_OUT_BASE + "thread_" + m_thrdNum + "_transformation_" + i + FILE_OUT_EXT); m_outResult[i] = new StreamResult(outStream); break; case SAX: DefaultHandler defaultHandler = new DefaultHandler(); m_outResult[i] = new SAXResult(defaultHandler); break; case DOM: m_outResult[i] = new DOMResult(); break; } } } catch (Exception e) { e.printStackTrace(); System.exit(1); } }
From source file:com.vmware.photon.controller.model.adapters.vsphere.ovf.OvfRetriever.java
private StoringInputStream toStream(URI ovfUri) throws IOException { SAXParser saxParser = newSaxParser(); DefaultHandler handler = new DefaultHandler(); InputStream is;/*from www. ja v a 2s. c om*/ HttpResponse response = null; HttpGet request = null; if (ovfUri.getScheme().equals("file")) { is = new FileInputStream(new File(ovfUri)); } else { request = new HttpGet(ovfUri); response = this.client.execute(request); if (response.getStatusLine().getStatusCode() != 200) { throw new IOException( "Ovf descriptor not found at " + ovfUri + ". Error code " + response.getStatusLine()); } is = response.getEntity().getContent(); } StoringInputStream storingInputStream = new StoringInputStream(is); try { saxParser.parse(storingInputStream, handler); if (response != null) { EntityUtils.consumeQuietly(response.getEntity()); } } catch (SAXException e) { // not a valid ovf - abort if (request != null) { request.abort(); } EntityUtils.consumeQuietly(response.getEntity()); throw new IOException("Ovf not a valid xml: " + e.getMessage(), e); } finally { //close stream, could be file IOUtils.closeQuietly(is); } return storingInputStream; }
From source file:MockFedoraIT.java
private void parseIrodsFile(IrodsIFileSystem module, String testPath) throws LowlevelStorageException { InputStream is = module.read(new File(testPath)); // initialize sax for this parse try {//from ww w . j av a 2s .co m SAXParserFactory spf = SAXParserFactory.newInstance(); // spf.setValidating(false); // spf.setNamespaceAware(true); SAXParser parser = spf.newSAXParser(); parser.parse(is, new DefaultHandler()); } catch (Exception e) { throw new RuntimeException("Error with SAX parser", e); } }
From source file:at.molindo.webtools.crawler.CrawlerTask.java
protected void parseResult(final String string) throws SAXException, IOException { InputSource inputSource;//from w w w. ja v a 2s . c o m if (_tidy) { final Tidy tidy = new Tidy(); tidy.setXHTML(true); tidy.setErrfile("/dev/null"); final ByteArrayInputStream in = new ByteArrayInputStream(string.getBytes()); final ByteArrayOutputStream out = new ByteArrayOutputStream(); tidy.parse(in, out); inputSource = new InputSource(new ByteArrayInputStream(out.toByteArray())); } else { inputSource = new InputSource(new StringReader(string)); } ((CrawlerThread) Thread.currentThread()).getParser().parse(inputSource, new DefaultHandler() { @Override public void startElement(final String uri, final String localName, final String name, final Attributes attributes) throws SAXException { if ("a".equals(name)) { String href = attributes.getValue("href"); if (href != null) { final int anchorIndex = href.lastIndexOf("#"); if (anchorIndex > 0) { href = href.substring(0, anchorIndex); } else if (anchorIndex == 0) { // anchor on same page: ignore return; } if (href != null) { final CrawlerReferrer referrer = new CrawlerReferrer(_urlString, href); if (!href.startsWith("http://")) { if (href.startsWith("/")) { _crawler.queue(_crawler._host + href.substring(1), referrer); } else if (!href.startsWith("javascript:") && !href.startsWith("ftp:") && !href.startsWith("mailto:")) { String relativeTo = _urlString.substring(0, _urlString.lastIndexOf("/")); boolean one = false, two = false; while ((two = href.startsWith("../")) || (one = href.startsWith("./"))) { if (two) { href = href.substring(3); relativeTo = relativeTo.substring(0, relativeTo.lastIndexOf("/")); } else if (one) { href = href.substring(2); } } _crawler.queue(relativeTo + "/" + href, referrer); } } else if (href.startsWith(_crawler._host)) { _crawler.queue(href, referrer); } } } } } @Override public InputSource resolveEntity(final String publicId, String systemId) throws IOException, SAXException { if 
("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(systemId)) { systemId = getClass().getClassLoader().getResource("xhtml1-transitional.dtd").toString(); } return _crawler.getDtdMemoryCache().resolveEntity(publicId, systemId); } }); }
From source file:crawler.configuration.rabbitmq.RabbitMQConfiguration.java
public void onMessage(Message message) { String url = new String(message.getBody()); log.info("parse(" + url + ").."); CloseableHttpResponse closeableHttpResponse = null; try {/*from w w w.j av a2s. c o m*/ closeableHttpResponse = socksSocketService.connect(url); if (closeableHttpResponse.getStatusLine().getStatusCode() == 200) { long length = closeableHttpResponse.getEntity().getContentLength(); Header contentType = closeableHttpResponse.getEntity().getContentType(); InputStream inputStream = closeableHttpResponse.getEntity().getContent(); TikaInputStream tikaInputStream = TikaInputStream.get(inputStream); Metadata metadata = new Metadata(); metadata.add(Metadata.RESOURCE_NAME_KEY, url); URL u = new URL(metadata.get(Metadata.RESOURCE_NAME_KEY)); metadata.add(Metadata.CONTENT_TYPE, contentType.getValue()); Detector detector = tikaConfig.getDetector(); MediaType mediaType = detector.detect(tikaInputStream, metadata); List<Page> pages = mongoPageRepository.findByUrl(u); // String text = bodyContentHandler.toString(); Page page = null; if (pages.size() == 0) { // manually inserted page page = new Page(); page.setInsertDate(new Date()); page.setUrl(u); log.info(" .. 
manual(" + url + ")"); } else { if (pages.size() > 1) { log.warn("url(" + url + ") multiple times in db!"); } // from html link extractor page = pages.get(0); page.setUpdateDate(new Date()); } ParseContext parseContext = new ParseContext(); LinkContentHandler linkContentHandler = new LinkContentHandler(); BodyContentHandler bodyContentHandler = new BodyContentHandler(MAX_BODY_LENGTH); ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler(); HtmlParser htmlParser = new HtmlParser(); DefaultHandler defaultHandler = new DefaultHandler(); TeeContentHandler teeContentHandler = new TeeContentHandler(linkContentHandler, bodyContentHandler, toHTMLContentHandler); AutoDetectParser autoDetectParser = new AutoDetectParser(); autoDetectParser.parse(tikaInputStream, teeContentHandler, metadata); log.info("autodetect: defaultHandler(" + gson.toJson(defaultHandler) + "), metadata(" + gson.toJson(metadata) + "), mediaType(" + gson.toJson(autoDetectParser.getMediaTypeRegistry()) + "), parseContext(" + gson.toJson(parseContext) + "): content(" + gson.toJson(teeContentHandler) + ")"); if ("application".equals(mediaType.getType())) { if ("xhtml+xml".equals(mediaType.getSubtype())) { htmlParser.parse(tikaInputStream, teeContentHandler, metadata, parseContext); String html = toHTMLContentHandler.toString(); page.setContent(html.toString().getBytes()); } else { log.warn("this contentType(" + mediaType.toString() + ") not supported!"); } } else if ("text".equals(mediaType.getType())) { if ("html".equals(mediaType.getSubtype())) { htmlParser.parse(tikaInputStream, teeContentHandler, metadata, parseContext); String html = toHTMLContentHandler.toString(); page.setContent(html.toString().getBytes()); } else { log.warn("this contentType(" + mediaType.toString() + ") not supported!"); } } page.setContent(IOUtils.toByteArray(tikaInputStream)); page.setMetadata(metadata); mongoPageRepository.save(page); } else { log.warn("url(" + url + "): statuscode(" + 
closeableHttpResponse.getStatusLine().getStatusCode() + ")"); } EntityUtils.consume(closeableHttpResponse.getEntity()); } catch (IOException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } catch (TikaException e) { e.printStackTrace(); } finally { if (closeableHttpResponse != null) { try { closeableHttpResponse.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:mj.ocraptor.extraction.tika.parser.odf.OpenDocumentParser.java
public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // TODO: reuse the already opened ZIPFile, if // present/* ww w . ja va2s .c o m*/ /* * ZipFile zipFile; if (stream instanceof TikaInputStream) { TikaInputStream * tis = (TikaInputStream) stream; Object container = ((TikaInputStream) * stream).getOpenContainer(); if (container instanceof ZipFile) { zipFile = * (ZipFile) container; } else if (tis.hasFile()) { zipFile = new * ZipFile(tis.getFile()); } } */ // TODO: if incoming IS is a TIS with a file // associated, we should open ZipFile so we can // visit metadata, mimetype first; today we lose // all the metadata if meta.xml is hit after // content.xml in the stream. Then we can still // read-once for the content.xml. XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata); // As we don't know which of the metadata or the content // we'll hit first, catch the endDocument call initially EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml); TikaImageHelper helper = new TikaImageHelper(metadata); try { // Process the file in turn ZipInputStream zip = new ZipInputStream(stream); ZipEntry entry = zip.getNextEntry(); while (entry != null) { // TODO: images String entryExtension = null; try { entryExtension = FilenameUtils.getExtension(new File(entry.getName()).getName()); } catch (Exception e) { e.printStackTrace(); } if (entryExtension != null && FileType.isValidImageFileExtension(entryExtension) && Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) { File imageFile = null; try { imageFile = TikaImageHelper.saveZipEntryToTemp(zip, entry); helper.addImage(imageFile); } catch (Exception e) { e.printStackTrace(); } finally { if (imageFile != null) { imageFile.delete(); } } } else if (entry.getName().equals("mimetype")) { String type = IOUtils.toString(zip, "UTF-8"); metadata.set(Metadata.CONTENT_TYPE, type); } 
else if (entry.getName().equals("meta.xml")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith("content.xml")) { if (content instanceof OpenDocumentContentParser) { ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); } else { // Foreign content parser was set: content.parse(zip, handler, metadata, context); } } else if (entry.getName().endsWith("styles.xml")) { if (content instanceof OpenDocumentContentParser) { ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); } else { // Foreign content parser was set: content.parse(zip, handler, metadata, context); } } entry = zip.getNextEntry(); } helper.addTextToHandler(xhtml); } catch (Exception e) { LOG.info("Extract error", e); } finally { if (helper != null) { helper.close(); } } // Only now call the end document if (handler.getEndDocumentWasCalled()) { handler.reallyEndDocument(); } }