List of usage examples for org.apache.lucene.document StringField StringField
public StringField(String name, BytesRef value, Store stored)
From source file:MyServlet.java
private static void addDoc(IndexWriter w, String Class, String number, String time, String department) throws IOException { Document doc = new Document(); // A text field will be tokenized doc.add(new TextField("Classes", Class, Field.Store.YES)); // We use a string field for isbn because we don\'t want it tokenized doc.add(new StringField("Number", number, Field.Store.YES)); doc.add(new StringField("Time", time, Field.Store.YES)); doc.add(new StringField("Department", department, Field.Store.YES)); w.addDocument(doc);/*from w ww .java 2 s . c o m*/ }
From source file:FileIndexer.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);// ww w. j a v a 2 s . c om // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", lastModified, Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:IrqaQuery.java
License:Apache License
public static void indexDoc(String docid, String... args) throws IOException { // docid, title, contents,... Document doc = new Document(); Field pathField = new StringField("docid", docid, Field.Store.YES); doc.add(pathField);//from w w w . j a v a 2s . co m for (int i = 0; i < args.length; i += 2) { String field = args[i]; String field_text = args[i + 1]; doc.add(new TextField(field, field_text, Field.Store.NO)); // System.out.println("[doc.add]" + path + ":" + field + ":" + field_text); } System.out.println("adding " + docid); writer.addDocument(doc); }
From source file:DocIndexer.java
License:Apache License
private RAMDirectory index() throws IOException, UnsupportedEncodingException, FileNotFoundException { RAMDirectory directory = new RAMDirectory(); IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer(CharArraySet.EMPTY_SET)); config.setOpenMode(OpenMode.CREATE); config.setCommitOnClose(true);/*ww w . j ava 2 s . co m*/ try (IndexWriter iwriter = new IndexWriter(directory, config)) { for (String inputFile : inputFiles) { File file = new File(inputFile); if (file.length() == 0) { continue; } String title; try (BufferedReader titleReader = new BufferedReader( new InputStreamReader(new FileInputStream(file), "UTF-8"))) { title = titleReader.readLine(); if (title != null && title.startsWith("[[")) { // Generally the first line of the txt is the title. In a few cases the // first line is a "[[tag]]" and the second line is the title. title = titleReader.readLine(); } } Matcher matcher = SECTION_HEADER.matcher(title); if (matcher.matches()) { title = matcher.group(1); } String outputFile = AsciiDoctor.mapInFileToOutFile(inputFile, inExt, outExt); try (FileReader reader = new FileReader(file)) { Document doc = new Document(); doc.add(new TextField(Constants.DOC_FIELD, reader)); doc.add(new StringField(Constants.URL_FIELD, prefix + outputFile, Field.Store.YES)); doc.add(new TextField(Constants.TITLE_FIELD, title, Field.Store.YES)); iwriter.addDocument(doc); } } } return directory; }
From source file:LuceneIndexDirectoryOrFile.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * /*from ww w .j a va2s . c o m*/ * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // File file = new File(fileToBeIndexed); BufferedReader br = new BufferedReader(new FileReader(file)); String title = ""; String docno = ""; String text = ""; String line = ""; boolean docStarted = false; Document doc = null; while ((line = br.readLine()) != null) { //Note that these fields are part of a TRECtext file if (line.indexOf("<DOC>") > -1) { docStarted = true; doc = new Document(); } else if (line.indexOf("</DOC>") > -1) { docStarted = false; /* Previous versions had fields with parameters doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("docno", docno, Field.Store.NO, Field.Index.NOT_ANALYZED)); doc.add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED)); StringField -- not analyzed, in one chunk TextFiled -- analyzed */ doc.add(new StringField("docno", docno, Field.Store.YES)); doc.add(new TextField("title", title, Field.Store.YES)); doc.add(new TextField("text", text, Field.Store.YES)); writer.addDocument(doc); } if (docStarted) { int i = -1; if (((i = line.indexOf("<TITLE>")) > -1) && (line.indexOf("</TITLE>") > -1)) { title = (line.substring(i + "<TITLE>".length(), line.indexOf("</TITLE>"))); } else if ((i = line.indexOf("<TEXT>")) > -1) { text = line.substring(i + "<TEXT>".length()); } else if ((i = line.indexOf("<DOCNO>")) > -1) { docno = line.substring(i + "<DOCNO>".length(), line.indexOf("</DOCNO>")); } } } br.close(); System.out.println("adding " + file); /* // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } */ } finally { fis.close(); } } } }
From source file:IndexTaxis.java
License:Apache License
static void addOneField(Document doc, String fieldName, String rawValue) { // nocommit/*from ww w.ja v a 2 s. c o m*/ /* if (fieldName.equals("pick_up_lat")) { double value = Double.parseDouble(rawValue); doc.add(new DoublePoint(fieldName, value)); doc.add(new SortedNumericDocValuesField(fieldName, NumericUtils.doubleToSortableLong(value))); } */ switch (fieldName) { case "vendor_id": case "cab_color": case "payment_type": case "trip_type": case "rate_code": case "store_and_fwd_flag": doc.add(new StringField(fieldName, rawValue, Field.Store.NO)); doc.add(new SortedSetDocValuesField(fieldName, new BytesRef(rawValue))); break; case "vendor_name": doc.add(new TextField(fieldName, rawValue, Field.Store.NO)); break; case "pick_up_date_time": case "drop_off_date_time": { long value = Long.parseLong(rawValue); doc.add(new LongPoint(fieldName, value)); doc.add(new SortedNumericDocValuesField(fieldName, value)); } break; case "passenger_count": { int value = Integer.parseInt(rawValue); doc.add(new IntPoint(fieldName, value)); doc.add(new SortedNumericDocValuesField(fieldName, value)); } break; case "trip_distance": case "pick_up_lat": case "pick_up_lon": case "drop_off_lat": case "drop_off_lon": case "fare_amount": case "surcharge": case "mta_tax": case "extra": case "ehail_fee": case "improvement_surcharge": case "tip_amount": case "tolls_amount": case "total_amount": { double value; try { value = Double.parseDouble(rawValue); } catch (NumberFormatException nfe) { System.out.println( "WARNING: failed to parse \"" + rawValue + "\" as double for field \"" + fieldName + "\""); return; } doc.add(new DoublePoint(fieldName, value)); doc.add(new SortedNumericDocValuesField(fieldName, NumericUtils.doubleToSortableLong(value))); } break; default: throw new AssertionError("failed to handle field \"" + fieldName + "\""); } }
From source file:alix.lucene.Alix.java
License:Open Source License
/** * Starts a new Lucene document and gives an id. * Called by the XSL/* w w w . j a v a 2s .c o m*/ * * @param idField ID field name */ public static void docNew() { doc = new Document(); // key to delete doc.add(new StringField(FILENAME, filename, Store.YES)); }
From source file:alix.lucene.Alix.java
License:Open Source License
private static void addField(String name, String value, String options, float boost) { // do not add field for null ? if (value == null) return;/*www.j a v a 2s .com*/ if (doc == null) { System.err.println("Please call docNew() before field()!"); return; } if (name == Alix.FILENAME) { System.err.println(name + " is a reserved field name for Alix."); return; } Field field; if (options == null || "".equals(options)) { field = new StringField(name, value, Store.YES); } else if (options.contains("#")) { field = new IntField(name, Integer.parseInt(value), Field.Store.YES); } else if (options.contains(".")) { field = new FloatField(name, Float.parseFloat(value), Field.Store.YES); } else { field = new Field(name, value, fieldType(options)); } if (boost > 0) field.setBoost(boost); doc.add(field); }
From source file:antnlp.opie.indexsearch.IndexFiles.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { InputStreamReader iReader = new InputStreamReader(Files.newInputStream(file), StandardCharsets.UTF_8); BufferedReader bufReader = new BufferedReader(iReader); String docLine = null;//from ww w .j a v a 2s . c o m while ((docLine = bufReader.readLine()) != null) { docLine = docLine.trim(); if (docLine.length() == 0) continue; String[] column = docLine.split("\\t"); System.out.println(column[0]); System.out.println(column[1]); // make a new, empty document Document doc = new Document(); // Add the id of the file as a field named "id". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field docidField = new StringField("docid", column[0], Field.Store.YES); doc.add(docidField); // Add the last modified date of the file a field named "modified". // Use a LongPoint that is indexed (i.e. efficiently filterable with // PointRangeQuery). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongPoint("modified", lastModified)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", column[1], Field.Store.YES)); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + column[0]); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + column[0]); writer.updateDocument(new Term("docid", column[0]), doc); } } iReader.close(); bufReader.close(); }
From source file:api.startup.PDFIndexer.java
License:Open Source License
/** * Indexes a single document and writes it to the given index writer * @param writer - the index writer to writer * @param metadata - the document//from w w w. j a v a 2s . c om * @throws IOException */ static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException { Path file = Paths.get(metadata.getFilename()); try { Document doc = new Document(); Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES); doc.add(pathField); // Add Document metadata // doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES)); // End of Document Metadata // Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(), Field.Store.YES); doc.add(modified); PDFTextExtractor extractor = new PDFTextExtractor(); // Get the string contents String textContents = extractor.extractText(file.toString()); // Store the string contents FieldType contentsType = new FieldType(); contentsType.setStored(true); contentsType.setTokenized(true); contentsType.setStoreTermVectors(true); contentsType.setStoreTermVectorPositions(true); contentsType.setStoreTermVectorPayloads(true); contentsType.setStoreTermVectorOffsets(true); contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType); doc.add(contents); if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): log.info("adding " + file + " to index"); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: log.info("updating " + file + " in index"); writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc); } } catch (IOException e) { log.error("Failed to read file " + metadata.getFilename()); } }