edu.umd.cloud9.webgraph.driver.ClueWebAnchorTextForwardIndexHttpServer.java Source code

Java tutorial

Introduction

Here is the source code for edu.umd.cloud9.webgraph.driver.ClueWebAnchorTextForwardIndexHttpServer.java

Source

/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.webgraph.driver;

import java.io.IOException;
import java.io.PrintWriter;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Random;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
import org.mortbay.jetty.Server;
import org.mortbay.jetty.servlet.Context;
import org.mortbay.jetty.servlet.ServletHolder;

import edu.umd.cloud9.collection.DocumentForwardIndex;
import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping;
import edu.umd.cloud9.mapred.NullInputFormat;
import edu.umd.cloud9.mapred.NullMapper;
import edu.umd.cloud9.mapred.NullOutputFormat;
import edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex;

/**
 * <p>
 * Runs an HTTP server to explore an anchor text/web graph collection. 
 * Command-line arguments are as follows:
 * </p>
 * 
 * <ul>
 * <li>[index-file]: the forward index file for the anchor text collection</li>
 * <li>[docno-mapping]: ClueWeb09 docno mapping file</li>
 * <li>[ClueWeb09-index-files-base]: the path to ClueWeb09 forward indexes</li>
 * </ul>
 * 
 * <p>
 * ClueWeb09 forward indexes should be stored at <code>[ClueWeb09-index-files-base]/findex.en.XX.dat</code>, 
 * where <code>XX</code> is the two-digit segment number (<code>01</code> through <code>10</code>).
 * </p>
 * 
 * @author Nima Asadi
 * 
 */

@SuppressWarnings("deprecation")
public class ClueWebAnchorTextForwardIndexHttpServer {

    /** Logger shared by the server mapper, the servlets and {@code main}. */
    private static final Logger LOG = Logger.getLogger(ClueWebAnchorTextForwardIndexHttpServer.class);
    /** Delimiter used to pack the ClueWeb index file paths into one job-configuration value. */
    private static final String SEPARATOR = ",";

    /** Last docno of each ClueWeb forward index; slot {@code i} is filled from {@code docForwardIndex[i]}. Fixed at 10 slots. */
    private static final int[] lastDocs = new int[10];
    /** Paths of the ClueWeb forward index files, in the order they appear in the job configuration. */
    private static final ArrayList<String> clueweb = new ArrayList<String>();

    /** Anchor text forward index; loaded by the server mapper and read by all servlets. */
    private static IndexableAnchorTextForwardIndex sForwardIndex;
    /** One document forward index per entry of {@code clueweb}; used to fetch original document content. */
    private static DocumentForwardIndex<Indexable>[] docForwardIndex;

    @SuppressWarnings("unchecked")
    private static class ServerMapper extends NullMapper {
        public void run(JobConf conf, Reporter reporter) throws IOException {
            int port = 8888;

            String indexFile = conf.get("IndexFile");
            String mappingFile = conf.get("DocnoMappingDataFile");
            Path tmpPath = new Path(conf.get("TmpPath"));
            String[] cluewebIndexFiles = conf.get("ClueWebIndexFiles").split(SEPARATOR);

            String host = InetAddress.getLocalHost().toString();

            LOG.info("host: " + host);
            LOG.info("port: " + port);
            LOG.info("forward index: " + indexFile);

            FSDataInputStream in = FileSystem.get(conf).open(new Path(indexFile));
            String indexClass = in.readUTF();
            in.close();

            LOG.info("index class: " + indexClass);

            try {
                sForwardIndex = new IndexableAnchorTextForwardIndex(new ClueWarcDocnoMapping());
                sForwardIndex.loadIndex(new Path(indexFile), new Path(mappingFile), FileSystem.get(conf));
            } catch (Exception e) {
                e.printStackTrace();
                throw new RuntimeException("Error initializing forward index!");
            }

            for (String s : cluewebIndexFiles)
                clueweb.add(s.trim());

            //opening clueweb forward index files
            docForwardIndex = new DocumentForwardIndex[clueweb.size()];

            for (int i = 0; i < clueweb.size(); i++) {
                in = FileSystem.get(conf).open(new Path(clueweb.get(i)));
                String indexClueWebClass = in.readUTF();
                in.close();
                try {
                    docForwardIndex[i] = (DocumentForwardIndex<Indexable>) Class.forName(indexClueWebClass)
                            .newInstance();
                    docForwardIndex[i].loadIndex(new Path(clueweb.get(i)), new Path(mappingFile),
                            FileSystem.get(conf));
                    lastDocs[i] = docForwardIndex[i].getLastDocno();

                } catch (Exception e) {
                    e.printStackTrace();
                    throw new RuntimeException("Error initializing forward index!");
                }
            }

            Server server = new Server(port);
            Context root = new Context(server, "/", Context.SESSIONS);
            root.addServlet(new ServletHolder(new FetchDocidServlet()), "/fetch_docid");
            root.addServlet(new ServletHolder(new FetchDocnoServlet()), "/fetch_docno");
            root.addServlet(new ServletHolder(new FetchDocContentServlet()), "/fetch_content");
            root.addServlet(new ServletHolder(new HomeServlet()), "/");

            FSDataOutputStream out = FileSystem.get(conf).create(tmpPath, true);
            out.writeUTF(host);
            out.close();

            try {
                server.start();
            } catch (Exception e) {
                e.printStackTrace();
            }

            while (true)
                ;
        }
    }

    /** Private and empty: this class is used only through its static members. */
    private ClueWebAnchorTextForwardIndexHttpServer() {
    }

    // this has to be public
    public static class HomeServlet extends HttpServlet {

        static final long serialVersionUID = 8253865405L;
        static final Random r = new Random();

        public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException {
            res.setContentType("text/html");
            PrintWriter out = res.getWriter();

            out.println("<html><head><title>Collection Access: " + sForwardIndex.getCollectionPath()
                    + "</title><head>");
            out.println("<body>");

            out.println("<h3>Collection Access: " + sForwardIndex.getCollectionPath() + "</h3>");

            int firstDocno = sForwardIndex.getFirstDocno();
            int lastDocno = sForwardIndex.getLastDocno();
            int numDocs = lastDocno - firstDocno;

            String firstDocid = sForwardIndex.getDocid(firstDocno);
            String lastDocid = sForwardIndex.getDocid(lastDocno);

            out.println("First document: docno <a href=\"/fetch_docno?docno=" + firstDocno + "\">" + firstDocno
                    + "</a> or <a href=\"/fetch_docid?docid=" + firstDocid + "\">" + firstDocid + "</a><br/>");
            out.println("Last document: docno <a href=\"/fetch_docno?docno=" + lastDocno + "\">" + lastDocno
                    + "</a> or <a href=\"/fetch_docid?docid=" + lastDocid + "\">" + lastDocid + "</a>");

            out.println("<h3>Fetch a docid</h3>");

            String id;

            out.println("<p>(random examples: ");

            id = sForwardIndex.getDocid(r.nextInt(numDocs) + firstDocno);
            out.println("<a href=\"/fetch_docid?docid=" + id + "\">" + id + "</a>, ");

            id = sForwardIndex.getDocid(r.nextInt(numDocs) + firstDocno);
            out.println("<a href=\"/fetch_docid?docid=" + id + "\">" + id + "</a>, ");

            id = sForwardIndex.getDocid(r.nextInt(numDocs) + firstDocno);
            out.println("<a href=\"/fetch_docid?docid=" + id + "\">" + id + "</a>)</p>");

            out.println("<form method=\"post\" action=\"fetch_docid\">");
            out.println("<input type=\"text\" name=\"docid\" size=\"60\" />");
            out.println("<input type=\"submit\" value=\"Fetch!\" />");
            out.println("</form>");
            out.println("</p>");

            out.println("<p>");
            out.println("<form method=\"post\" action=\"fetch_content\">");
            out.println("<input type=\"text\" name=\"docid\" size=\"60\" />");
            out.println("<input type=\"submit\" value=\"Fetch Content!\" />");
            out.println("</form>");
            out.println("</p>");

            out.println("<h3>Fetch a docno</h3>");

            int n;
            out.println("<p>(random examples: ");

            n = r.nextInt(numDocs) + firstDocno;
            out.println("<a href=\"/fetch_docno?docno=" + n + "\">" + n + "</a>, ");

            n = r.nextInt(numDocs) + firstDocno;
            out.println("<a href=\"/fetch_docno?docno=" + n + "\">" + n + "</a>, ");

            n = r.nextInt(numDocs) + firstDocno;
            out.println("<a href=\"/fetch_docno?docno=" + n + "\">" + n + "</a>)</p>");

            out.println("<p>");
            out.println("<form method=\"post\" action=\"fetch_docno\">");
            out.println("<input type=\"text\" name=\"docno\" size=\"60\" />");
            out.println("<input type=\"submit\" value=\"Fetch!\" />");
            out.println("</form>");
            out.println("</p>");

            out.println("<p>");
            out.println("<form method=\"post\" action=\"fetch_content\">");
            out.println("<input type=\"text\" name=\"docno\" size=\"60\" />");
            out.println("<input type=\"submit\" value=\"Fetch Content!\" />");
            out.println("</form>");
            out.println("</p>");

            out.print("</body></html>\n");

            out.close();
        }
    }

    public static class FetchDocContentServlet extends HttpServlet {
        static final long serialVersionUID = 5970126341L;

        public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException {
            doPost(req, res);
        }

        public void doPost(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException {
            LOG.info("triggered servlet for fetching document content");

            int docno = 0;
            try {
                if (req.getParameterValues("docno") != null)
                    docno = Integer.parseInt(req.getParameterValues("docno")[0]);
                else if (req.getParameterValues("docid") != null)
                    docno = sForwardIndex.getDocno(req.getParameterValues("docid")[0]);

                Indexable doc = null;
                int i = 0;
                for (i = 0; i < lastDocs.length; i++)
                    if (docno <= lastDocs[i]) {
                        doc = docForwardIndex[i].getDocument(docno);
                        break;
                    }

                if (doc != null) {
                    LOG.info("fetched: " + doc.getDocid() + " = docno " + docno);
                    res.setContentType(doc.getDisplayContentType());

                    PrintWriter out = res.getWriter();
                    out.print(doc.getContent().replaceAll("<\\s*/\\s*[bB][oO][dD][Yy]\\s*>",
                            "<br><br><a href=\"/fetch_docno?docno=" + docno + "\"> Fetch anchor text for docno: "
                                    + docno + "</a></body>"));
                    out.close();
                } else {
                    throw new Exception();
                }
            } catch (Exception e) {
                LOG.info("trapped error fetching " + docno);
                res.setContentType("text/html");

                PrintWriter out = res.getWriter();
                out.print("<html><head><title>Invalid docno!</title><head>\n");
                out.print("<body>\n");
                out.print("<h1>Error!</h1>\n");
                out.print("<h3>Invalid docno: " + docno + "</h3>\n");
                out.print("</body></html>\n");
                out.close();
            }
        }
    }

    // this has to be public
    public static class FetchDocidServlet extends HttpServlet {
        static final long serialVersionUID = 3986721097L;

        public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException {
            doPost(req, res);
        }

        public void doPost(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException {
            LOG.info("triggered servlet for fetching document by docid");
            String docid = null;

            try {

                if (req.getParameterValues("docid") != null)
                    docid = req.getParameterValues("docid")[0];

                Indexable doc = sForwardIndex.getDocument(docid);

                if (doc != null) {
                    LOG.info("fetched: " + doc.getDocid());
                    res.setContentType(doc.getDisplayContentType());

                    PrintWriter out = res.getWriter();
                    out.print(doc.getContent().replace("<body>", "<body><a href=\"/fetch_content?docid=" + docid
                            + "\"> Fetch content for docid: " + docid + "</a><br><br>"));
                    out.close();
                } else {
                    throw new Exception();
                }
            } catch (Exception e) {
                // catch-all, in case anything goes wrong
                LOG.info("trapped error fetching " + docid);
                res.setContentType("text/html");

                PrintWriter out = res.getWriter();
                out.print("<html><head><title>Invalid docid!</title><head>\n");
                out.print("<body>\n");
                out.print("<h1>Error!</h1>\n");
                out.print("<h3>Invalid docid: " + docid + "</h3>\n");
                out.print("</body></html>\n");
                out.close();
            }
        }

    }

    // this has to be public
    public static class FetchDocnoServlet extends HttpServlet {
        static final long serialVersionUID = 5970126341L;

        public void doGet(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException {
            doPost(req, res);
        }

        public void doPost(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException {
            LOG.info("triggered servlet for fetching document by docno");

            int docno = 0;
            try {

                if (req.getParameterValues("docno") != null)
                    docno = Integer.parseInt(req.getParameterValues("docno")[0]);

                Indexable doc = sForwardIndex.getDocument(docno);

                if (doc != null) {
                    LOG.info("fetched: " + doc.getDocid() + " = docno " + docno);
                    res.setContentType(doc.getDisplayContentType());

                    PrintWriter out = res.getWriter();
                    out.print(doc.getContent().replace("<body>", "<body><a href=\"/fetch_content?docno=" + docno
                            + "\"> Fetch content for docno: " + docno + "</a><br><br>"));
                    out.close();
                } else {
                    throw new Exception();
                }
            } catch (Exception e) {
                LOG.info("trapped error fetching " + docno);
                res.setContentType("text/html");

                PrintWriter out = res.getWriter();
                out.print("<html><head><title>Invalid docno!</title><head>\n");
                out.print("<body>\n");
                out.print("<h1>Error!</h1>\n");
                out.print("<h3>Invalid docno: " + docno + "</h3>\n");
                out.print("</body></html>\n");
                out.close();
            }
        }
    }

    /**
     * Submits the single-mapper job that hosts the HTTP server, waits for the
     * server task to publish its host name through a temporary rendezvous
     * file, and logs where the server can be reached.
     *
     * @param args generic Hadoop options followed by
     *        {@code [index-file] [docno-mapping-data-files] [clue-forward-index-root]}
     * @throws Exception if the job cannot be submitted or the rendezvous file
     *         cannot be read
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        if (otherArgs.length != 3) {
            System.out.println("usage: [index-file] [docno-mapping-data-files] [clue-forward-index-root]");
            //[clue-forward-index-root: /shared/ClueWeb09/collection.compressed.block/
            System.exit(-1);
        }

        String indexFile = otherArgs[0];
        String mappingFile = otherArgs[1];
        String clueIndexRoot = otherArgs[2].endsWith("/") ? otherArgs[2] : otherArgs[2] + "/";

        // Segments 01..10, joined with the separator the server mapper splits on.
        StringBuilder indexList = new StringBuilder();
        for (int i = 1; i < 10; i++) {
            indexList.append(clueIndexRoot).append("findex.en.0").append(i).append(".dat")
                    .append(SEPARATOR).append(" ");
        }
        indexList.append(clueIndexRoot).append("findex.en.10.dat");
        String cluewebForwardIndex = indexList.toString();

        LOG.info("Launching DocumentForwardIndexHttpServer");
        LOG.info(" - index file: " + indexFile);
        LOG.info(" - docno mapping data file: " + mappingFile);
        LOG.info(" - ClueWeb09 index root:" + clueIndexRoot);

        FileSystem fs = FileSystem.get(conf);

        Random rand = new Random();
        int r = rand.nextInt();

        // this tmp file as a rendezvous point
        Path tmpPath = new Path("/tmp/" + r);

        if (fs.exists(tmpPath)) {
            fs.delete(tmpPath, true);
        }

        JobConf job = new JobConf(conf, ClueWebAnchorTextForwardIndexHttpServer.class);

        job.setJobName("ForwardIndexServer:" + indexFile);

        job.set("mapred.child.java.opts", "-Xmx2048m");

        job.setNumMapTasks(1);
        job.setNumReduceTasks(0);

        job.setInputFormat(NullInputFormat.class);
        job.setOutputFormat(NullOutputFormat.class);
        job.setMapperClass(ServerMapper.class);

        job.set("IndexFile", indexFile);
        job.set("DocnoMappingDataFile", mappingFile);
        job.set("TmpPath", tmpPath.toString());
        job.set("ClueWebIndexFiles", cluewebForwardIndex);

        JobClient client = new JobClient(job);
        client.submitJob(job);

        LOG.info("Waiting for server to start up...");

        // NOTE(review): this loop only ends once the rendezvous file appears; if the
        // server task fails before writing it, the client waits here indefinitely.
        while (!fs.exists(tmpPath)) {
            Thread.sleep(50000);
            LOG.info("...");
        }

        String host;
        FSDataInputStream in = fs.open(tmpPath);
        try {
            host = in.readUTF();
        } finally {
            in.close();
        }

        LOG.info("host: " + host);
        LOG.info("port: 8888");
    }
}