Example usage for org.apache.lucene.benchmark.byTask.feeds DocData setName

List of usage examples for org.apache.lucene.benchmark.byTask.feeds DocData setName

Introduction

In this page you can find the example usage for org.apache.lucene.benchmark.byTask.feeds DocData setName.

Prototype

public void setName(String name) 

Source Link

Usage

From source file:com.grantingersoll.intell.index.EnwikiContentSource.java

License:Apache License

@Override
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    // Pull the next parsed record and repopulate the reusable DocData instance.
    // Each record is a fixed-position tuple of fields indexed by the
    // ID/BODY/DATE/TITLE constants.
    final String[] fields = parser.next();
    docData.clear();
    docData.setTitle(fields[TITLE]);
    docData.setDate(fields[DATE]);
    docData.setBody(fields[BODY]);
    docData.setName(fields[ID]);
    return docData;
}

From source file:info.boytsov.lucene.parsers.DemoHTMLParser.java

License:Apache License

/**
 * Parses an HTML document into the given (reused) {@code docData} instance.
 * <p>
 * The meta tags collected by the parser are folded into the body text so they
 * become searchable; a "date" meta tag, when parseable, overrides the
 * caller-supplied {@code date}. Parsing failures are best-effort: the error is
 * reported to stderr and a DocData with whatever was extracted (possibly empty
 * title/body) is returned rather than aborting the run.
 *
 * @param docData reusable container, cleared and repopulated here
 * @param name    document name, stored via {@code setName}
 * @param date    default document date, used unless a meta "date" parses
 * @param source  raw document input
 * @param trecSrc provides date parsing for the meta "date" value
 * @return the repopulated {@code docData}
 */
public DocData parse(DocData docData, String name, Date date, InputSource source, ContentSourceDateUtil trecSrc)
        throws IOException, SAXException {
    String bodyText = "";
    String title = "";
    try {
        Parser p = new Parser(source);

        // properties
        final Properties docProps = p.metaTags;
        String dateStr = docProps.getProperty("date");
        if (dateStr != null) {
            final Date newDate = trecSrc.parseDate(dateStr);
            if (newDate != null) {
                date = newDate;
            }
        }

        // Fold every meta key/value pair into the body text. Use a
        // StringBuilder: the original String '+' in a loop is O(n^2) in the
        // number of meta tags.
        final StringBuilder metaText = new StringBuilder();
        for (Entry<Object, Object> entry : docProps.entrySet()) {
            metaText.append(' ').append(entry.getKey()).append(' ').append(entry.getValue());
        }

        title = p.title;
        bodyText = title + " " + metaText + " " + p.body;
    } catch (Exception e) {
        // Best-effort: report and fall through with whatever was extracted.
        // Print the exception itself, not getMessage(), which can be null
        // (e.g. for NullPointerException) and would hide the failure cause.
        System.err.println("Parsing error: " + e);
    }

    docData.clear();
    docData.setName(name);
    docData.setTitle(title);
    docData.setBody(bodyText);
    docData.setProps(new Properties());
    docData.setDate(date);
    return docData;
}

From source file:info.boytsov.lucene.parsers.EnwikiContentSource.java

License:Apache License

@Override
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    // Fetch the next record; fields are addressed by the ID/BODY/DATE/TITLE
    // index constants.
    final String[] fields = parser.next();
    final String title = fields[TITLE];

    docData.clear();
    docData.setName(fields[ID]);
    // The title is prepended to the body so that it is searchable as body text.
    docData.setBody(title + " " + fields[BODY]);
    docData.setDate(fields[DATE]);
    docData.setTitle(title);

    /*
     *  TODO: @leo This is not a real URL, maybe we will need a real URL some day.
     *             This should be fine for sorting purposes, though. If the input
     *             is unsorted and we want to produce sorted document ids,
     *             this is just fine.
     */
    final Properties docProps = new Properties();
    docProps.put("url", title);
    docData.setProps(docProps);

    return docData;
}

From source file:info.boytsov.lucene.parsers.LeoHTMLParser.java

License:Open Source License

/**
 * Parses an HTML document into the given (reused) {@code docData} instance.
 * <p>
 * The document is read fully into memory and handed to the HTML parser; on a
 * {@link ParserException} a simple tag-stripping fallback is used instead, so
 * the method degrades gracefully rather than failing on malformed HTML.
 *
 * @param docData reusable container, cleared and repopulated here
 * @param name    document name, stored via {@code setName}
 * @param date    document date, stored as-is
 * @param source  raw document input; its character stream is fully consumed
 * @param trecSrc unused here (kept for interface compatibility)
 * @return the repopulated {@code docData}
 * @throws IOException if reading the character stream fails
 */
public DocData parse(DocData docData, String name, Date date, InputSource source, ContentSourceDateUtil trecSrc)
        throws IOException {
    String title = "";
    String bodyText = "";

    String baseHref = "http://fake-domain.com";
    String encoding = "utf8";

    /*
     * Slurp the whole document into memory first: clearly not the most
     * efficient way to parse, but it is much more stable.
     *
     * NOTE: readLine() strips line terminators, so lines are concatenated
     * without separators — behavior preserved from the original.
     */
    StringWriter writer = new StringWriter();
    // try-with-resources: the original leaked the reader when readLine() threw.
    try (BufferedReader br = new BufferedReader(source.getCharacterStream())) {
        String line;
        while (null != (line = br.readLine())) {
            writer.append(line);
        }
    }

    String html = writer.toString();

    try {
        Parser htmlParser = Parser.createParser(html, encoding);

        LeoCleanerUtil res = new LeoCleanerUtil(baseHref);
        htmlParser.visitAllNodesWith(res);

        title = res.GetTitleText();

        bodyText = title + " " + res.GetDescriptionText() + " " + res.GetKeywordText() + " "
                + res.GetBodyText();

    } catch (ParserException e) {
        // Plan B: fall back to a simple tag-stripping conversion.
        System.err.println(" Parser exception: " + e + " trying simple conversion");
        Pair<String, String> sres = LeoCleanerUtil.SimpleProc(html);

        title = sres.getFirst();
        bodyText = title + " " + sres.getSecond();
    }

    docData.clear();
    docData.setName(name);
    docData.setTitle(title);
    docData.setBody(bodyText);
    docData.setProps(new Properties());
    docData.setDate(date);

    return docData;
}

From source file:it.unipd.dei.ims.lucene.clef.parser.ClefDocParser.java

License:Apache License

@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf,
        ParsePathType pathType) throws IOException {
    // Nothing is skipped from the front of the buffer before tag stripping.
    final int skipped = 0;
    docData.clear();
    docData.setBody(stripTags(docBuf, skipped).toString());
    docData.setName(name);
    return docData;
}

From source file:parsers.ClueWebContentSource.java

License:Open Source License

/**
 * Returns the next WARC "response" record as a populated DocData.
 * <p>
 * Only the reading of WARC records from the shared reader is synchronized;
 * HTML parsing and DocData population run outside the lock. Records whose
 * header type is not "response" (e.g. file-level metadata entries) are
 * skipped. The document name is the WARC-TREC-ID header; the URL is stored
 * in the DocData properties under the key "url".
 * <p>
 * NOTE(review): this assumes htmlParser.parse() is thread safe — stated in
 * the inline comment below, not verifiable from here. Also, when parsing is
 * skipped (no HTML body, or non-http/ftp/https scheme) the docData is
 * returned as last populated by a previous call — presumably the caller
 * tolerates this; TODO confirm.
 */
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    WarcRecord CurrRec = null;

    // protect reading from the TREC files by multiple threads. The rest of the
    // method, i.e., parsing the content and returning the DocData can run unprotected.
    synchronized (lock) {
        if (reader == null) {
            openNextFile();
        }

        do {
            CurrRec = WarcRecord.readNextWarcRecord(reader);
            /*
             *  We need to skip special auxiliary entries, e.g., in the
             *  beginning of the file.
             */

        } while (CurrRec != null && !CurrRec.getHeaderRecordType().equals("response"));

        if (CurrRec == null) {
            // Current file exhausted: advance to the next file and retry.
            // NOTE(review): recursion depth grows with the number of
            // consecutive files yielding no "response" records; presumably
            // openNextFile() throws NoMoreDataException when input is
            // exhausted, terminating the recursion — TODO confirm.
            openNextFile();
            return getNextDocData(docData);
        }
    }

    Date date = parseDate(CurrRec.getHeaderMetadataItem("WARC-Date"));
    String url = CurrRec.getHeaderMetadataItem("WARC-Target-URI");
    String trecId = CurrRec.getHeaderMetadataItem("WARC-TREC-ID");

    if (null == trecId)
        throw new RuntimeException("No WARC-TREC-ID field for url: '" + url + "'");

    // This code segment relies on HtmlParser being thread safe. When we get 
    // here, everything else is already private to that thread, so we're safe.
    if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) {
        // In ClueWeb09, the HTTP response was incorrectly terminated by \n\n instead of \r\n\r\n
        // as requested by the standard 
        // So, to make all ClueWeb12 documents parseable with the old approach, we replace the first
        // \r\n\r\n with \n\n and will proceed as if we have ClueWeb09
        String Response = CurrRec.getContentUTF8().replaceFirst("\r\n\r\n", "\n\n");

        // The blank line separates the HTTP headers from the HTML payload.
        int EndOfHead = Response.indexOf("\n\n");

        if (EndOfHead >= 0) {
            String html = Response.substring(EndOfHead + 2);

            //System.out.println(html);
            //System.out.println("====================");

            docData = htmlParser.parse(docData, url, date, new StringReader(html), this);
            // This should be done after parse(), b/c parse() resets properties
            docData.getProps().put("url", url);
            // Override the parser-assigned name (the URL) with the TREC id.
            docData.setName(trecId);

        } else {
            /*
             *  TODO: @leo What do we do here exactly? 
             *  The interface doesn't allow us to signal that an entry should be skipped. 
             */
            System.err.println("Cannot extract HTML in URI: " + url);
        }
    } else {
        /*
         *  TODO: @leo What do we do here exactly? 
         *  The interface doesn't allow us to signal that an entry should be skipped. 
         */
        System.err.println("Ignoring schema in URI: " + url);
    }

    // Bookkeeping hook (defined elsewhere); presumably counts consumed items.
    addItem();

    return docData;
}

From source file:parsers.TrecGov2Parser.java

License:Apache License

/**
 * Parses one TREC GOV2 entry: skips the DOCHDR header section, extracts the
 * URL (second line of the header) and an optional date from the header, then
 * hands the remaining HTML to the content source's HTML parser.
 * <p>
 * The URL is stored in the DocData properties under the key "url", and the
 * name is (re)set after parsing because parse() resets properties.
 * <p>
 * NOTE(review): DOCHDR, TERMINATING_DOCHDR, DATE, DATE_END and extract() are
 * defined elsewhere in this class/file. If no DOCHDR is found, or the URL
 * scheme is not http/ftp/https, the docData is returned unmodified — see the
 * trailing TODO.
 */
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf,
        ParsePathType pathType) throws IOException {
    // skip some of the non-html text, optionally set date
    Date date = null;
    int start = 0;
    final int h1 = docBuf.indexOf(DOCHDR);
    if (h1 >= 0) {
        // The URL sits on the line immediately following the DOCHDR tag.
        final int hStart2dLine = h1 + DOCHDR.length() + 1;
        final int hEnd2dLine = docBuf.indexOf("\n", hStart2dLine);

        if (hEnd2dLine >= 0) {
            String url = docBuf.substring(hStart2dLine, hEnd2dLine).toLowerCase().trim();

            if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) {
                // NOTE(review): if TERMINATING_DOCHDR is absent, indexOf
                // returns -1 and 'start' goes negative — presumably the input
                // always contains the terminator; TODO confirm.
                final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1);
                final String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
                if (dateStr != null) {
                    date = trecSrc.parseDate(dateStr);
                }
                start = h2 + TERMINATING_DOCHDR.length();

                // Everything after the header terminator is the HTML payload.
                final String html = docBuf.substring(start);
                docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
                // This should be done after parse(), b/c parse() resets properties
                docData.getProps().put("url", url);
                docData.setName(name);
                return docData;
            } else {
                System.err.println("Ignoring schema in URI: " + url);
            }
        } else {
            throw new RuntimeException("Invalid header: " + docBuf.toString());
        }
    }

    /*
     *  TODO: @leo What do we do here exactly? 
     *  The interface doesn't allow us to signal that an entry should be skipped. 
     */

    return docData;
}