Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.*; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; /** A utility for making Lucene Documents from a File. */ public class TrecDocument { /** Makes a document for a File. <p> The document has three fields: <ul> <li><code>path</code>--containing the pathname of the file, as a stored, untokenized field; <li><code>modified</code>--containing the last modified date of the file as a field as created by <a href="lucene.document.DateTools.html">DateTools</a>; and <li><code>contents</code>--containing the full contents of the file, as a Reader field; */ @SuppressWarnings("deprecation") public static ArrayList<Document> Documents(File f) throws IOException { ArrayList<Document> Docs = new ArrayList<Document>(); BufferedReader reader = new BufferedReader(new FileReader(f)); String line = null; String text = "", DocNo = null; Document doc; boolean textStarted = false; while ((line = reader.readLine()) != null) { if (line.equals("<DOC>")) { text = ""; DocNo = null; textStarted = false; continue; } if (line.startsWith("<DOCNO>")) { //DocNo = reader.readLine(); Pattern p = Pattern.compile("<DOCNO> (.*) </DOCNO>"); Matcher m = p.matcher(line); boolean isvalid = m.matches(); if (isvalid) { DocNo = m.group(1); System.out.println(DocNo); } continue; } if (line.equals("<TEXT>")) { textStarted = true; text = ""; continue; } if (line.equals("</TEXT>")) { textStarted = false; continue; } if (textStarted) { text += "\n" + line; } if (line.equals("</DOC>")) { doc = new Document(); textStarted = false; doc.add(new Field("DocNo", DocNo, Field.Store.YES, Field.Index.NO)); doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES)); Docs.add(doc); } } return Docs; } private TrecDocument() { } }