com.google.enterprise.adaptor.experimental.Sim.java Source code

Introduction

Here is the source code for com.google.enterprise.adaptor.experimental.Sim.java
Source

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.adaptor.experimental;

import com.google.enterprise.adaptor.IOHelper;

import com.sun.net.httpserver.HttpExchange;
import com.sun.net.httpserver.HttpHandler;
import com.sun.net.httpserver.HttpServer;

import org.apache.commons.fileupload.FileItemIterator;
import org.apache.commons.fileupload.FileItemStream;
import org.apache.commons.fileupload.FileUpload;
import org.apache.commons.fileupload.FileUploadException;
import org.apache.commons.fileupload.RequestContext;
import org.apache.commons.fileupload.UploadContext;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

/** Accepts adaptor feeds and issues requests for documents. */
public class Sim implements Runnable {
    /** Logger used by the simulator and all of its nested classes. */
    private static Logger log = Logger.getLogger(Sim.class.getName());
    /** Charset used to encode response bodies and to convert feed XML to and from bytes. */
    static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * In-memory store of what the simulator knows about fed documents.
     *
     * <p>All fields are final: {@code urls} is also the monitor that guards
     * access to itself, and a lock object must never be replaced by another
     * instance while threads may be synchronizing on it.
     */
    private static class Index {
        /** URLs taken from accepted feeds; access is synchronized on this set. */
        final Set<URL> urls = new HashSet<URL>();
        /** Most recently crawled body for each URL. */
        final Map<URL, byte[]> content = new HashMap<URL, byte[]>();
        /** Content type reported for each URL; the mapped value could be null. */
        final Map<URL, String> type = new HashMap<URL, String>();
        /** Decoded metadata key/value pairs reported for each URL. */
        final Map<URL, Map<String, String>> meta = new HashMap<URL, Map<String, String>>();
    }

    /** Store shared by the feed acceptor (which adds URLs) and the crawler (which adds content, type and metadata). */
    private Index index = new Index(); // contains contents and metadata

    /**
     * Starts an HTTP server on port 19900 whose "/xmlfeed" context is served
     * by a {@link FeedAcceptor}.
     *
     * @throws IOException if the server cannot be created or bound
     */
    private void startFeedAcceptor() throws IOException {
        log.info("starting feed acceptor");
        // A non-positive backlog asks the server to use its default.
        final int defaultBacklog = -1;
        final InetSocketAddress feedAddress = new InetSocketAddress(19900);
        final HttpServer feedServer = HttpServer.create();
        feedServer.bind(feedAddress, defaultBacklog);
        feedServer.createContext("/xmlfeed", new FeedAcceptor());
        feedServer.start();
        log.info("started feed acceptor");
    }

    /** Launches a {@link Crawler} on a new thread and returns immediately. */
    private void startCrawler() {
        log.info("starting crawler");
        Thread crawlerThread = new Thread(new Crawler());
        crawlerThread.start();
        log.info("started crawler");
    }

    /**
     * Brings up the feed acceptor and then the crawler.
     *
     * @throws RuntimeException wrapping the {@link IOException} if the feed
     *     acceptor cannot be started; the crawler is not started in that case
     */
    public void run() {
        try {
            startFeedAcceptor();
        } catch (IOException startFailure) {
            throw new RuntimeException("failed to start feed acceptor", startFailure);
        }
        startCrawler();
        // TODO: add serving
        // TODO: add stops to exit
    }

    /**
     * Command-line entry point: creates a simulator and runs it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Sim simulator = new Sim();
        simulator.run();
    }

    /**
     * HTTP handler for feed submissions: a POST is treated as a multipart
     * metadata-and-url xml feed, any other method is rejected with 405.
     */
    private class FeedAcceptor implements HttpHandler {
        @Override
        public void handle(HttpExchange ex) throws IOException {
            log.info("in feed acceptor");
            final String method = ex.getRequestMethod();
            if ("POST".equals(method)) {
                URI requestUri = com.google.enterprise.adaptor.HttpExchanges.getRequestUri(ex);
                log.info("received post on path: " + requestUri.getPath());
                processMultipartPost(ex);
                return;
            }
            log.info("received non-post method: " + method);
            byte[] rejection = "server accepts POST only".getBytes(UTF8);
            respond(ex, HttpURLConnection.HTTP_BAD_METHOD, "text/plain", rejection);
        }
    }

    /**
     * Signals that no feed XML could be pulled out of a multipart POST: either
     * the request has no "data" part or reading the parts failed.
     */
    private static class NoXmlFound extends Exception {
    }

    /**
     * Signals that feed XML was received but is unusable; the message is sent
     * back to the feed's sender as the body of the error response.
     */
    private static class BadFeed extends Exception {
        /**
         * @param message description of what is wrong with the feed
         */
        BadFeed(String message) {
            super(message);
        }
    }

    /**
     * Periodically acquires new content and metadata for each URL.
     *
     * <p>Loops forever: sleeps 20 seconds, copies the set of indexed URLs and
     * fetches every one of them, storing the body, the content type and the
     * decoded metadata in the enclosing simulator's index.
     */
    private class Crawler implements Runnable {
        /** Copies the URL set while holding its lock, so crawling iterates a private snapshot. */
        private Set<URL> dupIndexUrls() {
            synchronized (index.urls) {
                return new HashSet<URL>(index.urls);
            }
        }

        public void run() {
            for (;;) {
                log.info("crawler about to hibernate");
                try {
                    Thread.sleep(1000 * 20);
                } catch (InterruptedException terup) {
                    // An interrupt only cuts the nap short; the crawl pass below still runs.
                    log.info("crawler awoken early");
                }
                log.info("crawler is awake");
                for (URL doc : dupIndexUrls()) {
                    crawl(doc);
                }
            }
        }

        /**
         * Fetches one document and records its body, content type and metadata.
         * An I/O failure is logged and leaves the index entries for the
         * document as they were before the failing step.
         *
         * @param doc location to fetch
         */
        private void crawl(URL doc) {
            log.info("about to crawl: " + doc);
            try {
                URLConnection con = doc.openConnection();
                InputStream body = con.getInputStream();
                byte content[];
                try {
                    content = IOHelper.readInputStreamToByteArray(body);
                } finally {
                    // Release the connection's stream even when reading fails;
                    // a second close is harmless should the helper close it too.
                    body.close();
                }
                index.content.put(doc, content);
                Map<String, List<String>> headers = con.getHeaderFields();
                List<String> ct = headerValues(headers, "Content-type");
                index.type.put(doc, (null != ct && ct.size() > 0) ? ct.get(0) : null);
                index.meta.put(doc, parseMeta(headerValues(headers, "X-gsa-external-metadata")));
                for (Map.Entry<String, List<String>> header : headers.entrySet()) {
                    log.info("header: " + header.getKey() + ":" + header.getValue());
                }
                log.info("crawled: " + doc);
            } catch (IOException ie) {
                log.info("failed getting: " + doc);
            }
        }

        /**
         * Finds a response header without depending on how the server
         * capitalized its name (header names are case-insensitive, but the
         * map handed out by the connection is keyed by the spelling received).
         *
         * @param headers header map of a connection
         * @param name header name to look for
         * @return the values stored under {@code name}, else the values of the
         *     first header whose name differs only in case, else null
         */
        private List<String> headerValues(Map<String, List<String>> headers, String name) {
            List<String> exact = headers.get(name);
            if (null != exact) {
                return exact;
            }
            for (Map.Entry<String, List<String>> header : headers.entrySet()) {
                // equalsIgnoreCase(null) is false, so a null key (HTTP
                // connections use one for the status line) never matches.
                if (name.equalsIgnoreCase(header.getKey())) {
                    return header.getValue();
                }
            }
            return null;
        }

        /**
         * Decodes every metadata header value into one key/value map.
         *
         * @param metadata header values, possibly null or containing nulls
         * @return decoded pairs; empty when there is nothing to decode
         */
        private Map<String, String> parseMeta(List<String> metadata) {
            Map<String, String> map = new HashMap<String, String>();
            if (null != metadata) {
                for (String m : metadata) {
                    if (null != m) {
                        parseMeta(map, m);
                    }
                }
            }
            return map;
        }

        /**
         * Decodes one header value, which holds a single percent-encoded
         * {@code key=value} pair or several of them separated by commas, and
         * adds the results to {@code map}. Pieces without '=' or with invalid
         * percent-encoding are logged and skipped.
         */
        private void parseMeta(Map<String, String> map, String metadatum) {
            String metadatums[] = metadatum.split(",", 0);
            switch (metadatums.length) {
            case 0:
                break;
            case 1: // really have single metadatum
                String m = metadatums[0];
                int splitPoint = m.indexOf('=');
                if (-1 == splitPoint) {
                    log.info("skipping metadatum: " + m);
                    return;
                }
                String key;
                String value;
                try {
                    key = percentDecode(m.substring(0, splitPoint));
                    value = percentDecode(m.substring(splitPoint + 1));
                } catch (IllegalArgumentException badEncoding) {
                    // The header comes from a remote server; letting this
                    // escape would terminate the crawler thread for good.
                    log.info("skipping metadatum: " + m);
                    return;
                }
                log.info("key: " + key);
                log.info("value: " + value);
                map.put(key, value);
                break;
            default: // have multiple pieces split by comma
                for (String p : metadatums) {
                    parseMeta(map, p);
                }
            }
        }
    }

    /**
     * Reads a feed out of a multipart POST, records the URLs it names and
     * answers the exchange: 200 "Success" when the feed is accepted, 400 with
     * a short reason when no XML is found or the XML is rejected.
     *
     * @param ex exchange carrying the POST; a response is sent on it in every
     *     case handled here
     * @throws IOException if the request body cannot be set up for reading or
     *     the response cannot be written
     */
    private void processMultipartPost(HttpExchange ex) throws IOException {
        InputStream inStream = ex.getRequestBody();
        String encoding = ex.getRequestHeaders().getFirst("Content-encoding");
        // equalsIgnoreCase is null-safe and, unlike toLowerCase(), does not
        // depend on the default locale (e.g. Turkish maps 'I' to dotless 'i').
        if ("gzip".equalsIgnoreCase(encoding)) {
            inStream = new GZIPInputStream(inStream);
        }
        String lens = ex.getRequestHeaders().getFirst("Content-Length");
        long len = (null != lens) ? Long.parseLong(lens) : 0;
        String ct = ex.getRequestHeaders().getFirst("Content-Type");
        try {
            String xml = extractFeedFromMultipartPost(inStream, len, ct);
            processXml(xml);
            respond(ex, HttpURLConnection.HTTP_OK, "text/plain", "Success".getBytes(UTF8));
        } catch (NoXmlFound nox) {
            log.warning("failed to find xml");
            respond(ex, HttpURLConnection.HTTP_BAD_REQUEST, "text/plain", "xml beginning not found".getBytes(UTF8));
        } catch (SAXException saxe) {
            log.warning("sax error: " + saxe.getMessage());
            respond(ex, HttpURLConnection.HTTP_BAD_REQUEST, "text/plain", "sax not liking the xml".getBytes(UTF8));
        } catch (ParserConfigurationException confige) {
            log.warning("parser error: " + confige.getMessage());
            respond(ex, HttpURLConnection.HTTP_BAD_REQUEST, "text/plain", "parser error".getBytes(UTF8));
        } catch (BadFeed bad) {
            log.warning("error in feed: " + bad.getMessage());
            respond(ex, HttpURLConnection.HTTP_BAD_REQUEST, "text/plain", bad.getMessage().getBytes(UTF8));
        }
    }

    /**
     * Adds every record URL found in the feed to the index. The feed is parsed
     * completely before the index is touched, so a rejected feed adds nothing.
     *
     * @param xml feed document
     * @throws SAXException if the XML cannot be parsed
     * @throws ParserConfigurationException if no suitable parser can be built
     * @throws BadFeed if a record has a missing or malformed url
     */
    private void processXml(String xml) throws SAXException, ParserConfigurationException, BadFeed {
        final Set<URL> feedUrls = extractUrls(xml);
        synchronized (index.urls) {
            for (URL feedUrl : feedUrls) {
                index.urls.add(feedUrl);
            }
        }
    }

    /**
     * Returns the "data" part of a multipart request body as UTF-8 text.
     *
     * @param in request body
     * @param len declared length of the body
     * @param contentType value of the request's Content-Type header
     * @return text of the part named "data"
     * @throws NoXmlFound if there is no "data" part or the parts cannot be read
     */
    static String extractFeedFromMultipartPost(InputStream in, long len, String contentType) throws NoXmlFound {
        final Map<String, byte[]> partsByField;
        try {
            partsByField = splitMultipartRequest(new HttpExchangeUploadInfo(in, len, contentType));
        } catch (IOException unreadable) {
            throw new NoXmlFound();
        }
        if (partsByField.containsKey("data")) {
            return new String(partsByField.get("data"), UTF8);
        }
        throw new NoXmlFound();
    }

    /**
     * Find all record urls in Adaptor created XML metadata-and-url feed file.
     *
     * @param xml feed document
     * @return the url attribute of every {@code record} element
     * @throws SAXException if the XML is not well formed
     * @throws ParserConfigurationException if the parser cannot be configured
     *     with the features set here
     * @throws BadFeed if the document cannot be read, or a record's url
     *     attribute is missing, blank or not a valid URL
     */
    static Set<URL> extractUrls(String xml) throws SAXException, ParserConfigurationException, BadFeed {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        /* to avoid blowing up on doctype line:
         * http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references */
        dbf.setValidating(false);
        dbf.setFeature("http://xml.org/sax/features/namespaces", false);
        dbf.setFeature("http://xml.org/sax/features/validation", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // The feed is posted by whoever can reach the port and may carry a
        // DOCTYPE, so it must not be able to make the parser fetch files or
        // URLs through entities declared in an internal DTD subset (XXE).
        dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
        dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        DocumentBuilder db = dbf.newDocumentBuilder();
        InputStream xmlStream = new ByteArrayInputStream(xml.getBytes(UTF8));
        Document doc;
        try {
            doc = db.parse(xmlStream);
        } catch (IOException ie) {
            throw new BadFeed(ie.getMessage());
        }
        doc.getDocumentElement().normalize();
        NodeList nodes = doc.getElementsByTagName("record");
        Set<URL> tmpUrls = new HashSet<URL>();
        for (int i = 0; i < nodes.getLength(); i++) {
            Element element = (Element) nodes.item(i);
            String url = element.getAttribute("url");
            if (null == url || url.trim().isEmpty()) {
                throw new BadFeed("record without url attribute");
            } else {
                try {
                    tmpUrls.add(new URL(url));
                } catch (MalformedURLException male) {
                    throw new BadFeed("record with bad url attribute: " + url);
                }
            }
            log.info("accepting url: " + url);
        }
        return tmpUrls;
    }

    /**
     * Writes a complete response and closes the exchange.
     *
     * @param ex exchange to answer
     * @param code HTTP status code
     * @param contentType value for the Content-Type response header
     * @param response bytes of the response body
     * @throws IOException if sending the headers or the body fails
     */
    private static void respond(HttpExchange ex, int code, String contentType, byte[] response) throws IOException {
        ex.getResponseHeaders().set("Content-Type", contentType);
        // Length 0 makes the server use chunked encoding, so the body size need not be declared.
        ex.sendResponseHeaders(code, 0);
        final OutputStream out = ex.getResponseBody();
        log.finest("before writing response");
        out.write(response, 0, response.length);
        out.flush();
        out.close();
        ex.close();
        log.finest("after closing exchange");
    }

    /**
     * Presents a request body, its declared length and its content type in
     * the form the FileUpload multipart parser takes as input.
     */
    private static class HttpExchangeUploadInfo implements UploadContext {
        InputStream inStream;
        long length;
        String contentType;

        /**
         * @param body stream positioned at the start of the request body
         * @param declaredLength length reported for the body
         * @param type Content-Type header value of the request
         */
        HttpExchangeUploadInfo(InputStream body, long declaredLength, String type) {
            inStream = body;
            length = declaredLength;
            contentType = type;
        }

        /** Returns the stream given at construction; every call returns the same object. */
        public InputStream getInputStream() throws IOException {
            return inStream;
        }

        public String getContentType() {
            return contentType;
        }

        /** Always reports UTF-8. */
        public String getCharacterEncoding() {
            // TODO: get from exchange?
            return "UTF-8";
        }

        public long contentLength() {
            return length;
        }

        /** Narrowing variant of {@link #contentLength()}; the long value is cast to int. */
        @Deprecated
        public int getContentLength() {
            return (int) length;
        }
    }

    /**
     * Reads every part of a multipart request fully into memory.
     *
     * @param req multipart request to split
     * @return content of each part keyed by its field name; when a field name
     *     repeats, the later part replaces the earlier one
     * @throws IOException if a part cannot be read, or wrapping the
     *     {@link FileUploadException} raised while parsing the request
     */
    static Map<String, byte[]> splitMultipartRequest(RequestContext req) throws IOException {
        final Map<String, byte[]> partsByField = new HashMap<String, byte[]>();
        try {
            for (FileItemIterator items = new FileUpload().getItemIterator(req); items.hasNext();) {
                FileItemStream part = items.next();
                String fieldName = part.getFieldName();
                partsByField.put(fieldName, IOHelper.readInputStreamToByteArray(part.openStream()));
            }
        } catch (FileUploadException e) {
            throw new IOException("caught FileUploadException", e);
        }
        return partsByField;
    }

    /**
     * Gives the numeric value of one ASCII hexadecimal digit.
     *
     * @param b ASCII code of the digit, upper or lower case
     * @return value between 0 and 15
     * @throws IllegalArgumentException if {@code b} is not a hex digit
     */
    private static int hexToInt(byte b) {
        if ('0' <= b && b <= '9') {
            return b - '0';
        }
        if ('a' <= b && b <= 'f') {
            return b - 'a' + 10;
        }
        if ('A' <= b && b <= 'F') {
            return b - 'A' + 10;
        }
        throw new IllegalArgumentException("invalid hex byte: " + b);
    }

    /** Tells whether the byte is an ASCII letter, digit, '-', '_', '.' or '~'. */
    private static boolean isUnreserved(byte b) {
        boolean lower = 'a' <= b && b <= 'z';
        boolean upper = 'A' <= b && b <= 'Z';
        boolean digit = '0' <= b && b <= '9';
        return lower || upper || digit || b == '-' || b == '_' || b == '.' || b == '~';
    }

    /**
     * Decodes percent-encoded text, reading the decoded bytes as UTF-8.
     *
     * @param encoded text made only of unreserved characters and %XX escapes
     * @return decoded text
     * @throws IllegalArgumentException if the input has any other character,
     *     a truncated escape or a non-hex digit in an escape
     */
    public static String percentDecode(String encoded) {
        try {
            return percentDecode(encoded.getBytes("ASCII")).toString("UTF-8");
        } catch (UnsupportedEncodingException uee) {
            throw new RuntimeException(uee);
        }
    }

    /**
     * Decodes percent-encoded ASCII bytes into the raw bytes they stand for.
     *
     * @param encoded unreserved characters and %XX escapes
     * @return stream holding the decoded bytes
     * @throws IllegalArgumentException on any byte that is neither unreserved
     *     nor part of a complete, valid escape
     */
    static ByteArrayOutputStream percentDecode(byte[] encoded) {
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        for (int pos = 0; pos < encoded.length;) {
            final byte current = encoded[pos];
            if (current == '%') {
                // An escape needs the two bytes after the percent sign.
                if (pos + 2 >= encoded.length) {
                    throw new IllegalArgumentException("ends too early");
                }
                int high = hexToInt(encoded[pos + 1]);
                int low = hexToInt(encoded[pos + 2]);
                out.write((high << 4) | low); // only the low eight bits are written
                pos += 3;
            } else if (isUnreserved(current)) {
                out.write(current);
                pos++;
            } else {
                throw new IllegalArgumentException("not percent encoded");
            }
        }
        return out;
    }
}