List of usage examples for the org.apache.lucene.document.Document constructor Document()
public Document()
From source file:IndexAndSearchOpenStreetMaps1D.java
License:Apache License
private static void createIndex() throws IOException { long t0 = System.nanoTime(); CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); int BUFFER_SIZE = 1 << 16; // 64K InputStream is = Files .newInputStream(Paths.get("/lucenedata/open-street-maps/latlon.subsetPlusAllLondon.txt")); BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE); Directory dir = FSDirectory.open(Paths.get("/c/tmp/bkdtest1d" + (USE_NF ? "_nf" : ""))); IndexWriterConfig iwc = new IndexWriterConfig(null); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); //iwc.setMaxBufferedDocs(109630); //iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); iwc.setRAMBufferSizeMB(256.0);/* w ww .j av a 2 s .co m*/ iwc.setMergePolicy(new LogDocMergePolicy()); iwc.setMergeScheduler(new SerialMergeScheduler()); iwc.setInfoStream(new PrintStreamInfoStream(System.out)); IndexWriter w = new IndexWriter(dir, iwc); int count = 0; byte[] scratch = new byte[4]; while (true) { String line = reader.readLine(); if (line == null) { break; } String[] parts = line.split(","); //long id = Long.parseLong(parts[0]); int lat = (int) (1000000. * Double.parseDouble(parts[1])); //int lon = (int) (1000000. * Double.parseDouble(parts[2])); Document doc = new Document(); if (USE_NF) { doc.add(new LegacyIntField("latnum", lat, Field.Store.NO)); //doc.add(new LongField("lonnum", lon, Field.Store.NO)); } else { doc.add(new IntPoint("lat", lat)); //doc.add(new SortedNumericDocValuesField("lon", lon)); } w.addDocument(doc); count++; if (count % 1000000 == 0) { System.out.println(count + "..."); } } //w.forceMerge(1); w.commit(); System.out.println(w.maxDoc() + " total docs"); w.close(); long t1 = System.nanoTime(); System.out.println(((t1 - t0) / 1000000000.0) + " sec to build index"); }
From source file:IndexTaxis.java
License:Apache License
/** Index all documents contained in one chunk */ static void indexOneChunk(String[] fields, Chunk chunk, IndexWriter w, AtomicInteger docCounter, AtomicLong bytesCounter) throws IOException { Document doc = new Document(); byte[] bytes = chunk.bytes; if (bytes[bytes.length - 1] != NEWLINE) { throw new AssertionError(); }//from ww w. j av a2 s . co m w.addDocuments(new Iterable<Document>() { @Override public Iterator<Document> iterator() { return new Iterator<Document>() { private int i; private Document nextDoc; private boolean nextSet; private int lastLineStart; private int chunkDocCount; @Override public boolean hasNext() { if (nextSet == false) { setNextDoc(); nextSet = true; } return nextDoc != null; } @Override public Document next() { assert nextSet; nextSet = false; Document result = nextDoc; nextDoc = null; return result; } private void setNextDoc() { Document doc = new Document(); int fieldUpto = 0; int lastFieldStart = i; for (; i < bytes.length; i++) { byte b = bytes[i]; if (b == NEWLINE || b == COMMA) { if (i > lastFieldStart) { String s = new String(bytes, lastFieldStart, i - lastFieldStart, StandardCharsets.UTF_8); addOneField(doc, fields[fieldUpto], s); } if (b == NEWLINE) { if (fieldUpto != fields.length - 1) { throw new AssertionError("fieldUpto=" + fieldUpto + " vs fields.length-1=" + (fields.length - 1)); } chunkDocCount++; this.nextDoc = doc; int x = docCounter.incrementAndGet(); long y = bytesCounter.addAndGet((i + 1) - lastLineStart); if (x % 100000 == 0) { double sec = (System.nanoTime() - startNS) / 1000000000.0; System.out.println(String.format(Locale.ROOT, "%.1f sec: %d docs; %.1f docs/sec; %.1f MB/sec", sec, x, x / sec, (y / 1024. / 1024.) / sec)); } fieldUpto = 0; i++; lastLineStart = i; return; } else { fieldUpto++; } lastFieldStart = i + 1; } } // System.out.println("chunk doc count: " + chunkDocCount); } }; } }); }
From source file:syslogProcess.java
License:Open Source License
void storeLine(org.w3c.dom.Document syslogDoc) { try {//from ww w . ja v a 2s . c o m NodeList nList = syslogDoc.getElementsByTagName("doc"); Node nNode = nList.item(0); Element eElement = (Element) nNode; String from = getTagValue("from", eElement); String facility = getTagValue("facility", eElement); String msg = getTagValue("msg", eElement); String hostname = getTagValue("hostname", eElement); int priority = Integer.parseInt(getTagValue("priority", eElement)); String tag = getTagValue("tag", eElement); String program = getTagValue("program", eElement); String severity = getTagValue("severity", eElement); long generated = Long.parseLong(getTagValue("generated", eElement)); Document doc = new Document(); doc.add(new Field("from", from, Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("facility", facility, Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("data", msg, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("hostname", hostname, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new NumericField("priority", Field.Store.NO, true).setIntValue(priority)); doc.add(new Field("tag", tag, Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("program", program, Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("severity", severity, Field.Store.NO, Field.Index.ANALYZED)); doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(generated)); writer.addDocument(doc); } catch (Exception ex) { System.out.print("Exception: " + ex.toString() + "\n"); } }
From source file:TrecDocument.java
License:Apache License
/** Makes a document for a File. <p>//from w w w.j a va 2s. c o m The document has three fields: <ul> <li><code>path</code>--containing the pathname of the file, as a stored, untokenized field; <li><code>modified</code>--containing the last modified date of the file as a field as created by <a href="lucene.document.DateTools.html">DateTools</a>; and <li><code>contents</code>--containing the full contents of the file, as a Reader field; */ @SuppressWarnings("deprecation") public static ArrayList<Document> Documents(File f) throws IOException { ArrayList<Document> Docs = new ArrayList<Document>(); BufferedReader reader = new BufferedReader(new FileReader(f)); String line = null; String text = "", DocNo = null; Document doc; boolean textStarted = false; while ((line = reader.readLine()) != null) { if (line.equals("<DOC>")) { text = ""; DocNo = null; textStarted = false; continue; } if (line.startsWith("<DOCNO>")) { //DocNo = reader.readLine(); Pattern p = Pattern.compile("<DOCNO> (.*) </DOCNO>"); Matcher m = p.matcher(line); boolean isvalid = m.matches(); if (isvalid) { DocNo = m.group(1); System.out.println(DocNo); } continue; } if (line.equals("<TEXT>")) { textStarted = true; text = ""; continue; } if (line.equals("</TEXT>")) { textStarted = false; continue; } if (textStarted) { text += "\n" + line; } if (line.equals("</DOC>")) { doc = new Document(); textStarted = false; doc.add(new Field("DocNo", DocNo, Field.Store.YES, Field.Index.NO)); doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES)); Docs.add(doc); } } return Docs; }
From source file:action.indexing.Fragments.java
License:Apache License
public void docBoostMethod() throws IOException { Directory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), IndexWriter.MaxFieldLength.UNLIMITED); // START//from w w w . ja va 2 s . c om Document doc = new Document(); String senderEmail = getSenderEmail(); String senderName = getSenderName(); String subject = getSubject(); String body = getBody(); doc.add(new Field("senderEmail", senderEmail, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("senderName", senderName, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("subject", subject, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("body", body, Field.Store.NO, Field.Index.ANALYZED)); String lowerDomain = getSenderDomain().toLowerCase(); if (isImportant(lowerDomain)) { doc.setBoost(1.5F); //1 } else if (isUnimportant(lowerDomain)) { doc.setBoost(0.1F); //2 } writer.addDocument(doc); // END writer.close(); /* #1 Good domain boost factor: 1.5 #2 Bad domain boost factor: 0.1 */ }
From source file:action.indexing.Fragments.java
License:Apache License
public void numberField() { Document doc = new Document(); // START// w w w. jav a2 s . co m doc.add(new NumericField("price").setDoubleValue(19.99)); // END }
From source file:action.indexing.Fragments.java
License:Apache License
public void numberTimestamp() { Document doc = new Document(); // START/*from ww w .j a v a 2s. c om*/ doc.add(new NumericField("timestamp").setLongValue(new Date().getTime())); // END // START doc.add(new NumericField("day").setIntValue((int) (new Date().getTime() / 24 / 3600))); // END Date date = new Date(); // START Calendar cal = Calendar.getInstance(); cal.setTime(date); doc.add(new NumericField("dayOfMonth").setIntValue(cal.get(Calendar.DAY_OF_MONTH))); // END }
From source file:action.indexing.Fragments.java
License:Apache License
/**
 * Adds a stored, non-analyzed "indexDate" field whose value is the current date rendered by
 * DateTools at DAY resolution.
 */
public void dateMethod() {
  Document doc = new Document();
  String today = DateTools.dateToString(new Date(), DateTools.Resolution.DAY);
  Field indexDate = new Field("indexDate", today, Field.Store.YES, Field.Index.NOT_ANALYZED);
  doc.add(indexDate);
}
From source file:action.indexing.Fragments.java
License:Apache License
public void numericField() throws Exception { Document doc = new Document(); NumericField price = new NumericField("price"); price.setDoubleValue(19.99);//from www . j a v a 2 s. c o m doc.add(price); NumericField timestamp = new NumericField("timestamp"); timestamp.setLongValue(new Date().getTime()); doc.add(timestamp); Date b = new Date(); NumericField birthday = new NumericField("birthday"); String v = DateTools.dateToString(b, DateTools.Resolution.DAY); birthday.setIntValue(Integer.parseInt(v)); doc.add(birthday); }
From source file:action.indexing.Fragments.java
License:Apache License
public void indexAuthors() throws Exception { String[] authors = new String[] { "lisa", "tom" }; // START/*from w w w .j a v a 2 s.c o m*/ Document doc = new Document(); for (String author : authors) { doc.add(new Field("author", author, Field.Store.YES, Field.Index.ANALYZED)); } // END }