Example usage for org.apache.lucene.benchmark.byTask.feeds DocData getProps

List of usage examples for org.apache.lucene.benchmark.byTask.feeds DocData getProps

Introduction

On this page you can find an example of usage for org.apache.lucene.benchmark.byTask.feeds DocData getProps.

Prototype

public Properties getProps() 

Source Link

Usage

From source file:com.tamingtext.qa.WikipediaWexIndexer.java

License:Apache License

/**
 * Indexes a Wikipedia WEX dump file into Solr in batches.
 *
 * @param wikipediaWEX the WEX dump file to read; must be an existing regular file
 * @param numDocs      maximum number of documents to index
 * @param batchSize    number of documents to buffer before each send to Solr
 * @return the number of documents indexed, or 0 when the file is missing
 * @throws Exception if reading the content source or talking to Solr fails
 */
public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaWEX != null && wikipediaWEX.isFile()) {
        WexWikiContentSource contentSource = new WexWikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaWEX.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        // Read the dump exactly once; do not loop forever over the input.
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0;
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) {
                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", String.valueOf(docData.getID()));
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("name_s", docData.getName());

                // BUGFIX: getProperty("category") returns null when a page has
                // no categories; the original code NPE'd on split().
                String categoryProp = docData.getProps().getProperty("category");
                if (categoryProp != null) {
                    for (String c : categoryProp.split(";;")) {
                        sDoc.addField("category", c);
                    }
                }

                // Flush a full batch to Solr and start a fresh buffer.
                if (i % batchSize == batchSize - 1) {
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i);
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException ignored) {
            // Expected: the content source signals end-of-input with this
            // exception; it is not an error condition.
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        // Send the final, possibly partial, batch.
        if (!docs.isEmpty()) {
            server.add(docs);
        }
        // BUGFIX: i already counts every processed document, including the
        // final partial batch; the original returned i + docs.size(), which
        // double-counted the unsent remainder.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaWEX);
    }
    return result;
}

From source file:info.boytsov.lucene.parsers.ClueWeb09ContentSource.java

License:Open Source License

/**
 * Returns the next parsed document from the ClueWeb09 WARC stream.
 *
 * <p>Reading from the underlying files is synchronized so multiple indexing
 * threads can share this source; HTML parsing runs unprotected and relies on
 * the HtmlParser implementation being thread safe.
 *
 * @param docData reusable carrier object, repopulated with the parsed content
 * @return the (re)populated DocData; returned unmodified when the record's
 *         HTML cannot be extracted or the URI schema is unsupported
 * @throws NoMoreDataException when no further input files are available
 * @throws IOException on read errors
 */
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    WarcRecord currRec = null;

    // Protect reading from the WARC files against multiple threads. The rest
    // of the method (parsing, returning the DocData) can run unprotected.
    synchronized (lock) {
        if (reader == null) {
            openNextFile();
        }

        // Skip special auxiliary entries (e.g. at the beginning of the file);
        // only "response" records carry document content.
        do {
            currRec = WarcRecord.readNextWarcRecord(reader);
        } while (currRec != null && !currRec.getHeaderRecordType().equals("response"));

        if (currRec == null) {
            // Current file is exhausted: advance to the next one and retry.
            openNextFile();
            return getNextDocData(docData);
        }
    }

    Date date = parseDate(currRec.getHeaderMetadataItem("WARC-Date"));
    String url = currRec.getHeaderMetadataItem("WARC-Target-URI");

    // This code segment relies on HtmlParser being thread safe. When we get
    // here, everything else is already private to this thread, so we're safe.
    if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) {
        String response = currRec.getContentUTF8();

        // ClueWeb09 HTTP headers are terminated by "\n\n"; the body follows.
        int endOfHead = response.indexOf("\n\n");

        if (endOfHead >= 0) {
            String html = response.substring(endOfHead + 2);

            docData = htmlParser.parse(docData, url, date, new StringReader(html), this);
            // Must happen after parse(), because parse() resets the properties.
            docData.getProps().put("url", url);
        } else {
            // The interface offers no way to signal "skip this entry", so we
            // report the problem and return the unmodified docData.
            System.err.println("Cannot extract HTML in URI: " + url);
        }
    } else {
        // Same limitation: no skip signal available, report and fall through.
        System.err.println("Ignoring schema in URI: " + url);
    }

    addItem();

    return docData;
}

From source file:info.boytsov.lucene.parsers.TrecGov2Parser.java

License:Apache License

/**
 * Parses one TREC GOV2 entry: locates the document header, validates the URL
 * line, optionally extracts a date, and hands the trailing HTML to the
 * content source's HTML parser.
 *
 * @param docData  reusable carrier object, repopulated with the parsed content
 * @param name     logical document name, forwarded to the HTML parser
 * @param trecSrc  content source providing the date parser and HTML parser
 * @param docBuf   raw document text including the DOCHDR block
 * @param pathType unused here; part of the parser interface
 * @return the (re)populated DocData; returned unmodified when the entry
 *         cannot be parsed (the interface offers no "skip" signal)
 * @throws IOException on parser errors
 */
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf,
        ParsePathType pathType) throws IOException {
    final int hdrPos = docBuf.indexOf(DOCHDR);
    if (hdrPos < 0) {
        // No document header at all; return the input untouched.
        return docData;
    }

    // The URL sits on the line immediately following the DOCHDR marker.
    final int urlStart = hdrPos + DOCHDR.length() + 1;
    final int urlEnd = docBuf.indexOf("\n", urlStart);
    if (urlEnd < 0) {
        System.err.println("Invalid header: " + docBuf.toString());
        return docData;
    }

    final String url = docBuf.substring(urlStart, urlEnd).toLowerCase().trim();
    final boolean supportedSchema =
            url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://");
    if (!supportedSchema) {
        System.err.println("Ignoring schema in URI: " + url);
        return docData;
    }

    final int hdrEnd = docBuf.indexOf(TERMINATING_DOCHDR, hdrPos);
    final String dateStr = extract(docBuf, DATE, DATE_END, hdrEnd, null);
    final Date date = (dateStr != null) ? trecSrc.parseDate(dateStr) : null;

    // Everything after the terminating header marker is the HTML payload.
    final String html = docBuf.substring(hdrEnd + TERMINATING_DOCHDR.length());
    docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
    // Must happen after parse(), because parse() resets the properties.
    docData.getProps().put("url", url);
    return docData;
}

From source file:parsers.ClueWebContentSource.java

License:Open Source License

/**
 * Fetches the next "response" WARC record from the ClueWeb12 stream and
 * parses its HTML payload into the supplied DocData.
 *
 * <p>Only the file read is guarded by the lock; parsing below it runs
 * unprotected and relies on HtmlParser being thread safe.
 *
 * @param docData reusable carrier object, repopulated with the parsed content
 * @return the (re)populated DocData; returned unmodified when the record's
 *         HTML cannot be extracted or the URI schema is unsupported
 * @throws NoMoreDataException when no further input files are available
 * @throws IOException on read errors
 */
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    WarcRecord record = null;

    synchronized (lock) {
        if (reader == null) {
            openNextFile();
        }

        // Skip special auxiliary entries (e.g. at the beginning of the file);
        // only "response" records carry document content.
        do {
            record = WarcRecord.readNextWarcRecord(reader);
        } while (record != null && !record.getHeaderRecordType().equals("response"));

        if (record == null) {
            // This file is exhausted: advance to the next one and retry.
            openNextFile();
            return getNextDocData(docData);
        }
    }

    Date date = parseDate(record.getHeaderMetadataItem("WARC-Date"));
    String url = record.getHeaderMetadataItem("WARC-Target-URI");
    String trecId = record.getHeaderMetadataItem("WARC-TREC-ID");

    if (null == trecId) {
        throw new RuntimeException("No WARC-TREC-ID field for url: '" + url + "'");
    }

    boolean supportedSchema =
            url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://");
    if (supportedSchema) {
        // ClueWeb09 incorrectly terminated the HTTP response with "\n\n"
        // instead of the standard "\r\n\r\n". Rewriting the first "\r\n\r\n"
        // lets ClueWeb12 documents be split with the same "\n\n" search as
        // the old ClueWeb09 approach.
        String response = record.getContentUTF8().replaceFirst("\r\n\r\n", "\n\n");

        int headerEnd = response.indexOf("\n\n");
        if (headerEnd >= 0) {
            String html = response.substring(headerEnd + 2);

            docData = htmlParser.parse(docData, url, date, new StringReader(html), this);
            // Must happen after parse(), because parse() resets the properties.
            docData.getProps().put("url", url);
            docData.setName(trecId);
        } else {
            // The interface offers no way to signal "skip this entry", so we
            // report the problem and return the unmodified docData.
            System.err.println("Cannot extract HTML in URI: " + url);
        }
    } else {
        // Same limitation: no skip signal available, report and fall through.
        System.err.println("Ignoring schema in URI: " + url);
    }

    addItem();

    return docData;
}

From source file:parsers.TrecGov2Parser.java

License:Apache License

/**
 * Parses one TREC GOV2 entry: locates the document header, validates the URL
 * line, optionally extracts a date, and hands the trailing HTML to the
 * content source's HTML parser. Unlike the sibling parser, this variant also
 * records the logical name on the DocData and treats a malformed header as a
 * hard error.
 *
 * @param docData  reusable carrier object, repopulated with the parsed content
 * @param name     logical document name, stored on the result via setName
 * @param trecSrc  content source providing the date parser and HTML parser
 * @param docBuf   raw document text including the DOCHDR block
 * @param pathType unused here; part of the parser interface
 * @return the (re)populated DocData; returned unmodified when the entry
 *         cannot be parsed (the interface offers no "skip" signal)
 * @throws IOException      on parser errors
 * @throws RuntimeException when the header has no terminating newline
 */
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf,
        ParsePathType pathType) throws IOException {
    final int hdrPos = docBuf.indexOf(DOCHDR);
    if (hdrPos < 0) {
        // No document header at all; return the input untouched.
        return docData;
    }

    // The URL sits on the line immediately following the DOCHDR marker.
    final int urlStart = hdrPos + DOCHDR.length() + 1;
    final int urlEnd = docBuf.indexOf("\n", urlStart);
    if (urlEnd < 0) {
        throw new RuntimeException("Invalid header: " + docBuf.toString());
    }

    final String url = docBuf.substring(urlStart, urlEnd).toLowerCase().trim();
    final boolean supportedSchema =
            url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://");
    if (!supportedSchema) {
        System.err.println("Ignoring schema in URI: " + url);
        return docData;
    }

    final int hdrEnd = docBuf.indexOf(TERMINATING_DOCHDR, hdrPos);
    final String dateStr = extract(docBuf, DATE, DATE_END, hdrEnd, null);
    final Date date = (dateStr != null) ? trecSrc.parseDate(dateStr) : null;

    // Everything after the terminating header marker is the HTML payload.
    final String html = docBuf.substring(hdrEnd + TERMINATING_DOCHDR.length());
    docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
    // Must happen after parse(), because parse() resets the properties.
    docData.getProps().put("url", url);
    docData.setName(name);
    return docData;
}