Example usage for org.apache.lucene.document Document Document

Introduction

On this page you can find example usages of the org.apache.lucene.document.Document constructor Document().

Prototype

public Document()

Source Link

Document

Constructs a new document with no fields.

Usage

From source file:IndexAndSearchOpenStreetMaps1D.java

License:Apache License

private static void createIndex() throws IOException {

    long t0 = System.nanoTime();

    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    int BUFFER_SIZE = 1 << 16; // 64K
    InputStream is = Files
            .newInputStream(Paths.get("/lucenedata/open-street-maps/latlon.subsetPlusAllLondon.txt"));
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);

    Directory dir = FSDirectory.open(Paths.get("/c/tmp/bkdtest1d" + (USE_NF ? "_nf" : "")));

    IndexWriterConfig iwc = new IndexWriterConfig(null);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    //iwc.setMaxBufferedDocs(109630);
    //iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    iwc.setRAMBufferSizeMB(256.0);/* w ww .j  av a 2 s .co  m*/
    iwc.setMergePolicy(new LogDocMergePolicy());
    iwc.setMergeScheduler(new SerialMergeScheduler());
    iwc.setInfoStream(new PrintStreamInfoStream(System.out));
    IndexWriter w = new IndexWriter(dir, iwc);

    int count = 0;
    byte[] scratch = new byte[4];
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] parts = line.split(",");
        //long id = Long.parseLong(parts[0]);
        int lat = (int) (1000000. * Double.parseDouble(parts[1]));
        //int lon = (int) (1000000. * Double.parseDouble(parts[2]));
        Document doc = new Document();
        if (USE_NF) {
            doc.add(new LegacyIntField("latnum", lat, Field.Store.NO));
            //doc.add(new LongField("lonnum", lon, Field.Store.NO));
        } else {
            doc.add(new IntPoint("lat", lat));
            //doc.add(new SortedNumericDocValuesField("lon", lon));
        }
        w.addDocument(doc);
        count++;
        if (count % 1000000 == 0) {
            System.out.println(count + "...");
        }
    }
    //w.forceMerge(1);
    w.commit();
    System.out.println(w.maxDoc() + " total docs");

    w.close();
    long t1 = System.nanoTime();
    System.out.println(((t1 - t0) / 1000000000.0) + " sec to build index");
}

From source file:IndexTaxis.java

License:Apache License

/** Index all documents contained in one chunk */
static void indexOneChunk(String[] fields, Chunk chunk, IndexWriter w, AtomicInteger docCounter,
        AtomicLong bytesCounter) throws IOException {

    Document doc = new Document();
    byte[] bytes = chunk.bytes;
    if (bytes[bytes.length - 1] != NEWLINE) {
        throw new AssertionError();
    }//from   ww  w. j  av  a2  s .  co  m
    w.addDocuments(new Iterable<Document>() {
        @Override
        public Iterator<Document> iterator() {
            return new Iterator<Document>() {
                private int i;
                private Document nextDoc;
                private boolean nextSet;
                private int lastLineStart;
                private int chunkDocCount;

                @Override
                public boolean hasNext() {
                    if (nextSet == false) {
                        setNextDoc();
                        nextSet = true;
                    }

                    return nextDoc != null;
                }

                @Override
                public Document next() {
                    assert nextSet;
                    nextSet = false;
                    Document result = nextDoc;
                    nextDoc = null;
                    return result;
                }

                private void setNextDoc() {
                    Document doc = new Document();
                    int fieldUpto = 0;
                    int lastFieldStart = i;
                    for (; i < bytes.length; i++) {
                        byte b = bytes[i];
                        if (b == NEWLINE || b == COMMA) {
                            if (i > lastFieldStart) {
                                String s = new String(bytes, lastFieldStart, i - lastFieldStart,
                                        StandardCharsets.UTF_8);
                                addOneField(doc, fields[fieldUpto], s);
                            }
                            if (b == NEWLINE) {
                                if (fieldUpto != fields.length - 1) {
                                    throw new AssertionError("fieldUpto=" + fieldUpto + " vs fields.length-1="
                                            + (fields.length - 1));
                                }
                                chunkDocCount++;
                                this.nextDoc = doc;
                                int x = docCounter.incrementAndGet();
                                long y = bytesCounter.addAndGet((i + 1) - lastLineStart);
                                if (x % 100000 == 0) {
                                    double sec = (System.nanoTime() - startNS) / 1000000000.0;
                                    System.out.println(String.format(Locale.ROOT,
                                            "%.1f sec: %d docs; %.1f docs/sec; %.1f MB/sec", sec, x, x / sec,
                                            (y / 1024. / 1024.) / sec));
                                }
                                fieldUpto = 0;
                                i++;
                                lastLineStart = i;
                                return;
                            } else {
                                fieldUpto++;
                            }
                            lastFieldStart = i + 1;
                        }
                    }
                    // System.out.println("chunk doc count: " + chunkDocCount);
                }
            };
        }
    });
}

From source file:syslogProcess.java

License:Open Source License

void storeLine(org.w3c.dom.Document syslogDoc) {

    try {//from  ww  w  .  ja  v a  2s .  c o  m
        NodeList nList = syslogDoc.getElementsByTagName("doc");
        Node nNode = nList.item(0);
        Element eElement = (Element) nNode;

        String from = getTagValue("from", eElement);
        String facility = getTagValue("facility", eElement);
        String msg = getTagValue("msg", eElement);
        String hostname = getTagValue("hostname", eElement);
        int priority = Integer.parseInt(getTagValue("priority", eElement));
        String tag = getTagValue("tag", eElement);
        String program = getTagValue("program", eElement);
        String severity = getTagValue("severity", eElement);
        long generated = Long.parseLong(getTagValue("generated", eElement));

        Document doc = new Document();
        doc.add(new Field("from", from, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field("facility", facility, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field("data", msg, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("hostname", hostname, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new NumericField("priority", Field.Store.NO, true).setIntValue(priority));
        doc.add(new Field("tag", tag, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field("program", program, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field("severity", severity, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(generated));

        writer.addDocument(doc);

    } catch (Exception ex) {
        System.out.print("Exception: " + ex.toString() + "\n");
    }

}

From source file:TrecDocument.java

License:Apache License

/** Makes a document for a File.
  <p>//from   w  w  w.j  a va  2s.  c  o m
  The document has three fields:
  <ul>
  <li><code>path</code>--containing the pathname of the file, as a stored,
  untokenized field;
  <li><code>modified</code>--containing the last modified date of the file as
  a field as created by <a
  href="lucene.document.DateTools.html">DateTools</a>; and
  <li><code>contents</code>--containing the full contents of the file, as a
  Reader field;
  */

@SuppressWarnings("deprecation")
public static ArrayList<Document> Documents(File f) throws IOException {
    ArrayList<Document> Docs = new ArrayList<Document>();

    BufferedReader reader = new BufferedReader(new FileReader(f));
    String line = null;
    String text = "", DocNo = null;
    Document doc;
    boolean textStarted = false;
    while ((line = reader.readLine()) != null) {

        if (line.equals("<DOC>")) {
            text = "";
            DocNo = null;
            textStarted = false;
            continue;
        }
        if (line.startsWith("<DOCNO>")) {
            //DocNo = reader.readLine();

            Pattern p = Pattern.compile("<DOCNO> (.*) </DOCNO>");
            Matcher m = p.matcher(line);
            boolean isvalid = m.matches();
            if (isvalid) {
                DocNo = m.group(1);
                System.out.println(DocNo);
            }

            continue;
        }

        if (line.equals("<TEXT>")) {
            textStarted = true;
            text = "";
            continue;
        }
        if (line.equals("</TEXT>")) {
            textStarted = false;
            continue;
        }
        if (textStarted) {
            text += "\n" + line;
        }
        if (line.equals("</DOC>")) {
            doc = new Document();
            textStarted = false;

            doc.add(new Field("DocNo", DocNo, Field.Store.YES, Field.Index.NO));
            doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));

            Docs.add(doc);
        }
    }
    return Docs;

}

From source file:action.indexing.Fragments.java

License:Apache License

public void docBoostMethod() throws IOException {

    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
            IndexWriter.MaxFieldLength.UNLIMITED);

    // START//from   w  w w . ja  va  2  s .  c om
    Document doc = new Document();
    String senderEmail = getSenderEmail();
    String senderName = getSenderName();
    String subject = getSubject();
    String body = getBody();
    doc.add(new Field("senderEmail", senderEmail, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("senderName", senderName, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("subject", subject, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("body", body, Field.Store.NO, Field.Index.ANALYZED));
    String lowerDomain = getSenderDomain().toLowerCase();
    if (isImportant(lowerDomain)) {
        doc.setBoost(1.5F); //1
    } else if (isUnimportant(lowerDomain)) {
        doc.setBoost(0.1F); //2 
    }
    writer.addDocument(doc);
    // END
    writer.close();

    /*
      #1 Good domain boost factor: 1.5
      #2 Bad domain boost factor: 0.1
    */
}

From source file:action.indexing.Fragments.java

License:Apache License

public void numberField() {
    Document doc = new Document();
    // START// w  w w.  jav a2 s .  co  m
    doc.add(new NumericField("price").setDoubleValue(19.99));
    // END
}

From source file:action.indexing.Fragments.java

License:Apache License

public void numberTimestamp() {
    Document doc = new Document();
    // START/*from   ww w  .j a  v a 2s.  c om*/
    doc.add(new NumericField("timestamp").setLongValue(new Date().getTime()));
    // END

    // START
    doc.add(new NumericField("day").setIntValue((int) (new Date().getTime() / 24 / 3600)));
    // END

    Date date = new Date();
    // START
    Calendar cal = Calendar.getInstance();
    cal.setTime(date);
    doc.add(new NumericField("dayOfMonth").setIntValue(cal.get(Calendar.DAY_OF_MONTH)));
    // END
}

From source file:action.indexing.Fragments.java

License:Apache License

public void dateMethod() {
    Document doc = new Document();
    doc.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
}

From source file:action.indexing.Fragments.java

License:Apache License

public void numericField() throws Exception {
    Document doc = new Document();
    NumericField price = new NumericField("price");
    price.setDoubleValue(19.99);//from  www .  j a v a  2  s.  c o m
    doc.add(price);

    NumericField timestamp = new NumericField("timestamp");
    timestamp.setLongValue(new Date().getTime());
    doc.add(timestamp);

    Date b = new Date();
    NumericField birthday = new NumericField("birthday");
    String v = DateTools.dateToString(b, DateTools.Resolution.DAY);
    birthday.setIntValue(Integer.parseInt(v));
    doc.add(birthday);
}

From source file:action.indexing.Fragments.java

License:Apache License

public void indexAuthors() throws Exception {
    String[] authors = new String[] { "lisa", "tom" };
    // START/*from w  w  w  .j a v  a  2  s.c  o m*/
    Document doc = new Document();
    for (String author : authors) {
        doc.add(new Field("author", author, Field.Store.YES, Field.Index.ANALYZED));
    }
    // END
}