Example usage for the org.apache.lucene.document.Document constructor Document()

List of usage examples for the org.apache.lucene.document.Document constructor Document()

Introduction

On this page you can find example usages of the Document() constructor of the class org.apache.lucene.document.Document.

Prototype

public Document() 

Source Link

Document

Constructs a new document with no fields.

Usage

From source file:IndexAndSearchOpenStreetMaps1D.java

License:Apache License

/**
 * Builds a one-dimensional test index from the OpenStreetMap lat/lon text dump.
 *
 * <p>Every input line is split on commas; the second column is parsed as a double, scaled by
 * 1,000,000 and truncated to an {@code int}. When {@code USE_NF} is set the value is indexed as a
 * {@code LegacyIntField} named "latnum", otherwise as an {@code IntPoint} named "lat". The index
 * is always created from scratch ({@code OpenMode.CREATE}). Progress is printed every million
 * documents and the total elapsed time is printed once the writer has been closed.
 *
 * @throws IOException if the input cannot be read or decoded, or the index cannot be written
 */
private static void createIndex() throws IOException {

    long t0 = System.nanoTime();

    // REPORT makes undecodable input raise an exception instead of being silently replaced.
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    final int bufferSize = 1 << 16; // 64K

    IndexWriterConfig iwc = new IndexWriterConfig(null);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setMergePolicy(new LogDocMergePolicy());
    iwc.setMergeScheduler(new SerialMergeScheduler());
    iwc.setInfoStream(new PrintStreamInfoStream(System.out));

    // try-with-resources closes writer, directory and reader (in that order) even when
    // reading, parsing or indexing fails part-way through.
    try (InputStream is = Files
            .newInputStream(Paths.get("/lucenedata/open-street-maps/latlon.subsetPlusAllLondon.txt"));
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), bufferSize);
            Directory dir = FSDirectory.open(Paths.get("/c/tmp/bkdtest1d" + (USE_NF ? "_nf" : "")));
            IndexWriter w = new IndexWriter(dir, iwc)) {

        int count = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            String[] parts = line.split(",");
            // Column 0 is an id and column 2 a longitude; only the latitude is indexed here.
            int lat = (int) (1000000. * Double.parseDouble(parts[1]));

            Document doc = new Document();
            if (USE_NF) {
                doc.add(new LegacyIntField("latnum", lat, Field.Store.NO));
            } else {
                doc.add(new IntPoint("lat", lat));
            }
            w.addDocument(doc);

            count++;
            if (count % 1000000 == 0) {
                System.out.println(count + "...");
            }
        }

        w.commit();
        System.out.println(w.maxDoc() + " total docs");
    }

    // Measured after the writer is closed so the reported time includes the close.
    long t1 = System.nanoTime();
    System.out.println(((t1 - t0) / 1000000000.0) + " sec to build index");
}

From source file:IndexTaxis.java

License:Apache License

/**
 * Indexes all documents contained in one chunk.
 *
 * <p>The chunk's bytes are parsed lazily by an iterator handed to
 * {@code IndexWriter.addDocuments}: fields are separated by {@code COMMA}, documents by
 * {@code NEWLINE}. Empty fields are skipped; every non-empty field is decoded as UTF-8 and passed
 * to {@code addOneField} together with the field name at the same position in {@code fields}.
 *
 * @param fields       field names, one per column of a line
 * @param chunk        the chunk whose {@code bytes} are parsed; must end with {@code NEWLINE}
 * @param w            the writer that receives the documents
 * @param docCounter   incremented once per parsed document
 * @param bytesCounter increased by the byte length of every parsed line (newline included)
 * @throws IOException    if the writer fails
 * @throws AssertionError if the chunk does not end with a newline, or a line does not contain
 *                        exactly {@code fields.length} columns
 */
static void indexOneChunk(String[] fields, Chunk chunk, IndexWriter w, AtomicInteger docCounter,
        AtomicLong bytesCounter) throws IOException {

    byte[] bytes = chunk.bytes;
    // A trailing partial line would never reach the NEWLINE branch below and so would be dropped.
    if (bytes[bytes.length - 1] != NEWLINE) {
        throw new AssertionError();
    }
    w.addDocuments(new Iterable<Document>() {
        @Override
        public Iterator<Document> iterator() {
            return new Iterator<Document>() {
                /** Read position in {@code bytes}. */
                private int i;
                /** Look-ahead document, or null when the chunk is exhausted. */
                private Document nextDoc;
                /** True when {@code nextDoc} has been computed but not yet handed out. */
                private boolean nextSet;
                /** Offset of the first byte of the line currently being parsed. */
                private int lastLineStart;
                private int chunkDocCount;

                @Override
                public boolean hasNext() {
                    if (nextSet == false) {
                        setNextDoc();
                        nextSet = true;
                    }

                    return nextDoc != null;
                }

                @Override
                public Document next() {
                    // Honor the Iterator contract even when assertions are disabled or the
                    // caller skipped hasNext().
                    if (hasNext() == false) {
                        throw new java.util.NoSuchElementException();
                    }
                    nextSet = false;
                    Document result = nextDoc;
                    nextDoc = null;
                    return result;
                }

                /** Parses the next line into {@code nextDoc}; leaves it null at end of chunk. */
                private void setNextDoc() {
                    Document doc = new Document();
                    int fieldUpto = 0;
                    int lastFieldStart = i;
                    for (; i < bytes.length; i++) {
                        byte b = bytes[i];
                        if (b == NEWLINE || b == COMMA) {
                            if (i > lastFieldStart) {
                                String s = new String(bytes, lastFieldStart, i - lastFieldStart,
                                        StandardCharsets.UTF_8);
                                addOneField(doc, fields[fieldUpto], s);
                            }
                            if (b == NEWLINE) {
                                if (fieldUpto != fields.length - 1) {
                                    throw new AssertionError("fieldUpto=" + fieldUpto + " vs fields.length-1="
                                            + (fields.length - 1));
                                }
                                chunkDocCount++;
                                this.nextDoc = doc;
                                int x = docCounter.incrementAndGet();
                                long y = bytesCounter.addAndGet((i + 1) - lastLineStart);
                                // Periodic throughput report based on the shared counters.
                                if (x % 100000 == 0) {
                                    double sec = (System.nanoTime() - startNS) / 1000000000.0;
                                    System.out.println(String.format(Locale.ROOT,
                                            "%.1f sec: %d docs; %.1f docs/sec; %.1f MB/sec", sec, x, x / sec,
                                            (y / 1024. / 1024.) / sec));
                                }
                                // Step past the newline so the next call starts on the next line.
                                i++;
                                lastLineStart = i;
                                return;
                            } else {
                                fieldUpto++;
                            }
                            lastFieldStart = i + 1;
                        }
                    }
                    // System.out.println("chunk doc count: " + chunkDocCount);
                }
            };
        }
    });
}

From source file:syslogProcess.java

License:Open Source License

/**
 * Turns one parsed syslog message into a Lucene document and passes it to {@code writer}.
 *
 * <p>The values are read from the first {@code <doc>} element of the XML document via
 * {@code getTagValue}. "priority" and "generated" are parsed as numbers and indexed as numeric
 * fields ("priority" and "date"); the message text is indexed under the field name "data".
 * Any exception is reported on standard output and otherwise ignored.
 *
 * @param syslogDoc the parsed XML representation of a single syslog line
 */
void storeLine(org.w3c.dom.Document syslogDoc) {

    try {
        Element entry = (Element) syslogDoc.getElementsByTagName("doc").item(0);

        // Read every value first so nothing is added to the document if one of them is bad.
        String sender = getTagValue("from", entry);
        String facilityName = getTagValue("facility", entry);
        String message = getTagValue("msg", entry);
        String host = getTagValue("hostname", entry);
        int priorityValue = Integer.parseInt(getTagValue("priority", entry));
        String tagName = getTagValue("tag", entry);
        String programName = getTagValue("program", entry);
        String severityName = getTagValue("severity", entry);
        long generatedAt = Long.parseLong(getTagValue("generated", entry));

        NumericField priorityField = new NumericField("priority", Field.Store.NO, true);
        priorityField.setIntValue(priorityValue);
        NumericField dateField = new NumericField("date", Field.Store.YES, true);
        dateField.setLongValue(generatedAt);

        Document luceneDoc = new Document();
        luceneDoc.add(new Field("from", sender, Field.Store.NO, Field.Index.ANALYZED));
        luceneDoc.add(new Field("facility", facilityName, Field.Store.NO, Field.Index.ANALYZED));
        luceneDoc.add(new Field("data", message, Field.Store.YES, Field.Index.ANALYZED));
        luceneDoc.add(new Field("hostname", host, Field.Store.YES, Field.Index.ANALYZED));
        luceneDoc.add(priorityField);
        luceneDoc.add(new Field("tag", tagName, Field.Store.NO, Field.Index.ANALYZED));
        luceneDoc.add(new Field("program", programName, Field.Store.NO, Field.Index.ANALYZED));
        luceneDoc.add(new Field("severity", severityName, Field.Store.NO, Field.Index.ANALYZED));
        luceneDoc.add(dateField);

        writer.addDocument(luceneDoc);

    } catch (Exception ex) {
        System.out.print("Exception: " + ex + "\n");
    }

}

From source file:TrecDocument.java

License:Apache License

/**
 * Parses a TREC-style file into Lucene documents, one per {@code </DOC>} line.
 *
 * <p>Each returned document has two fields:
 * <ul>
 * <li><code>DocNo</code>--the text captured from a line of the form
 * {@code <DOCNO> id </DOCNO>}, stored but not indexed; and
 * <li><code>contents</code>--the lines between {@code <TEXT>} and {@code </TEXT>}, each
 * preceded by a newline, stored and analyzed with term vectors.
 * </ul>
 *
 * <p>The markers {@code <DOC>}, {@code <TEXT>}, {@code </TEXT>} and {@code </DOC>} are only
 * recognized when they make up the whole line. Every recognized document number is also printed
 * to standard output.
 *
 * @param f the file to parse
 * @return the parsed documents, in file order
 * @throws IOException if the file cannot be opened or read
 */
@SuppressWarnings("deprecation")
public static ArrayList<Document> Documents(File f) throws IOException {
    ArrayList<Document> docs = new ArrayList<Document>();

    // Compiled once, outside the loop, because it is identical for every line.
    Pattern docNoPattern = Pattern.compile("<DOCNO> (.*) </DOCNO>");

    // NOTE(review): FileReader decodes with the platform default charset; confirm that matches
    // the encoding of the collection.
    BufferedReader reader = new BufferedReader(new FileReader(f));
    try {
        String line;
        String docNo = null;
        // StringBuilder avoids re-copying the whole text for every appended line.
        StringBuilder text = new StringBuilder();
        boolean textStarted = false;
        while ((line = reader.readLine()) != null) {

            if (line.equals("<DOC>")) {
                text.setLength(0);
                docNo = null;
                textStarted = false;
                continue;
            }
            if (line.startsWith("<DOCNO>")) {
                Matcher m = docNoPattern.matcher(line);
                if (m.matches()) {
                    docNo = m.group(1);
                    System.out.println(docNo);
                }
                continue;
            }

            if (line.equals("<TEXT>")) {
                textStarted = true;
                text.setLength(0);
                continue;
            }
            if (line.equals("</TEXT>")) {
                textStarted = false;
                continue;
            }
            if (textStarted) {
                text.append('\n').append(line);
            }
            if (line.equals("</DOC>")) {
                Document doc = new Document();
                textStarted = false;

                // NOTE(review): docNo is still null here when no well-formed <DOCNO> line was
                // seen for this document; confirm how Field handles a null value.
                doc.add(new Field("DocNo", docNo, Field.Store.YES, Field.Index.NO));
                doc.add(new Field("contents", text.toString(), Field.Store.YES, Field.Index.ANALYZED,
                        Field.TermVector.YES));

                docs.add(doc);
            }
        }
    } finally {
        // Release the file handle even when reading or document construction fails.
        reader.close();
    }
    return docs;

}

From source file:action.indexing.Fragments.java

License:Apache License

/**
 * Indexes one e-mail message into an in-memory index, boosting or demoting the whole document
 * depending on the sender's domain.
 *
 * @throws IOException if the index writer cannot be created, written to or closed
 */
public void docBoostMethod() throws IOException {

    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
            IndexWriter.MaxFieldLength.UNLIMITED);

    try {
        // START
        Document doc = new Document();
        String senderEmail = getSenderEmail();
        String senderName = getSenderName();
        String subject = getSubject();
        String body = getBody();
        doc.add(new Field("senderEmail", senderEmail, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("senderName", senderName, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("subject", subject, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("body", body, Field.Store.NO, Field.Index.ANALYZED));
        // Fixed locale: the default-locale overload lower-cases 'I' differently under e.g. a
        // Turkish default locale, which would break the domain comparison.
        String lowerDomain = getSenderDomain().toLowerCase(java.util.Locale.ENGLISH);
        if (isImportant(lowerDomain)) {
            doc.setBoost(1.5F); //1
        } else if (isUnimportant(lowerDomain)) {
            doc.setBoost(0.1F); //2
        }
        writer.addDocument(doc);
        // END
    } finally {
        // Close the writer even if building or adding the document fails.
        writer.close();
    }

    /*
      #1 Good domain boost factor: 1.5
      #2 Bad domain boost factor: 0.1
    */
}

From source file:action.indexing.Fragments.java

License:Apache License

/** Shows how to add a numeric field holding a double value to a document. */
public void numberField() {
    Document doc = new Document();
    // START
    NumericField priceField = new NumericField("price");
    priceField.setDoubleValue(19.99);
    doc.add(priceField);
    // END
}

From source file:action.indexing.Fragments.java

License:Apache License

/**
 * Shows three ways of indexing a point in time numerically: as epoch milliseconds, as a whole
 * number of days since the epoch, and as the day of the month.
 */
public void numberTimestamp() {
    Document doc = new Document();
    // START
    doc.add(new NumericField("timestamp").setLongValue(new Date().getTime()));
    // END

    // START
    // Date.getTime() is in milliseconds, so a day is 24 * 3600 * 1000 of them.
    long millisPerDay = 24L * 3600L * 1000L;
    doc.add(new NumericField("day").setIntValue((int) (new Date().getTime() / millisPerDay)));
    // END

    Date date = new Date();
    // START
    Calendar cal = Calendar.getInstance();
    cal.setTime(date);
    doc.add(new NumericField("dayOfMonth").setIntValue(cal.get(Calendar.DAY_OF_MONTH)));
    // END
}

From source file:action.indexing.Fragments.java

License:Apache License

/**
 * Shows how to index the current date, formatted by {@code DateTools} at day resolution, as a
 * stored, non-analyzed field named "indexDate".
 */
public void dateMethod() {
    Document doc = new Document();
    String today = DateTools.dateToString(new Date(), DateTools.Resolution.DAY);
    Field indexDate = new Field("indexDate", today, Field.Store.YES, Field.Index.NOT_ANALYZED);
    doc.add(indexDate);
}

From source file:action.indexing.Fragments.java

License:Apache License

/**
 * Shows numeric fields of three kinds on one document: a double ("price"), a long holding the
 * current time in milliseconds ("timestamp"), and an int parsed from the day-resolution
 * {@code DateTools} string of the current date ("birthday").
 *
 * @throws Exception declared by the original fragment
 */
public void numericField() throws Exception {
    Document doc = new Document();

    doc.add(new NumericField("price").setDoubleValue(19.99));

    long nowMillis = new Date().getTime();
    doc.add(new NumericField("timestamp").setLongValue(nowMillis));

    // The day-resolution string consists of digits only, so it can be stored as an int.
    Date birthDate = new Date();
    String dayString = DateTools.dateToString(birthDate, DateTools.Resolution.DAY);
    int dayNumber = Integer.parseInt(dayString);
    doc.add(new NumericField("birthday").setIntValue(dayNumber));
}

From source file:action.indexing.Fragments.java

License:Apache License

/**
 * Shows a multi-valued field: the same field name "author" is added to one document once per
 * author.
 *
 * @throws Exception declared by the original fragment
 */
public void indexAuthors() throws Exception {
    String[] authors = { "lisa", "tom" };
    // START
    Document doc = new Document();
    for (int idx = 0; idx < authors.length; idx++) {
        Field authorField = new Field("author", authors[idx], Field.Store.YES, Field.Index.ANALYZED);
        doc.add(authorField);
    }
    // END
}