TrecDocument.java Source code

Java tutorial

Introduction

Here is the source code for TrecDocument.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.*;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

/**
 * A utility for making Lucene Documents from a File that holds one or more
 * records in TREC-style markup ({@code <DOC>}, {@code <DOCNO>}, {@code <TEXT>}).
 */

public class TrecDocument {

    /**
     * Matches a complete DOCNO line. Group 1 is the identifier with the
     * whitespace next to the tags removed; that whitespace is optional so both
     * {@code <DOCNO> X </DOCNO>} and {@code <DOCNO>X</DOCNO>} are accepted.
     * Compiled once because the same pattern is applied to every record.
     */
    private static final Pattern DOCNO_LINE =
            Pattern.compile("<DOCNO>\\s*(.*?)\\s*</DOCNO>\\s*");

    /** Makes one Lucene document per <code>&lt;DOC&gt;</code> record found in a File.
      <p>
      The file is read line by line with the platform default charset. Tag
      lines are recognised only when the tag is the whole line
      (<code>&lt;DOC&gt;</code>, <code>&lt;/DOC&gt;</code>,
      <code>&lt;TEXT&gt;</code>, <code>&lt;/TEXT&gt;</code>) or, for
      <code>&lt;DOCNO&gt;</code>, when the line starts with the tag.
      <p>
      Each document has two fields:
      <ul>
      <li><code>DocNo</code>--the identifier taken from the
      <code>&lt;DOCNO&gt;</code> line, as a stored, unindexed field; and
      <li><code>contents</code>--the lines between <code>&lt;TEXT&gt;</code> and
      <code>&lt;/TEXT&gt;</code>, each preceded by a newline, as a stored,
      analyzed field with term vectors. If a record has several
      <code>&lt;TEXT&gt;</code> sections only the last one is kept.
      </ul>
      <p>
      Every parsed identifier is also echoed to standard output.
      <p>
      NOTE(review): a record that reaches <code>&lt;/DOC&gt;</code> without a
      parseable <code>&lt;DOCNO&gt;</code> line passes a null value to the
      <code>Field</code> constructor; Lucene is expected to reject that --
      confirm against the Lucene version in use.

      @param f the TREC-formatted file to read
      @return the documents in the order their <code>&lt;/DOC&gt;</code> lines
              appear; empty if the file contains none
      @throws IOException if the file cannot be opened or read
      */

    @SuppressWarnings("deprecation")
    public static ArrayList<Document> Documents(File f) throws IOException {
        ArrayList<Document> docs = new ArrayList<Document>();

        BufferedReader reader = new BufferedReader(new FileReader(f));
        try {
            StringBuilder text = new StringBuilder();
            String docNo = null;
            boolean textStarted = false;

            String line;
            while ((line = reader.readLine()) != null) {
                if (line.equals("<DOC>")) {
                    // A new record begins: forget everything from the previous one.
                    text.setLength(0);
                    docNo = null;
                    textStarted = false;
                } else if (line.startsWith("<DOCNO>")) {
                    // A DOCNO line is never treated as text, even when it does not parse.
                    Matcher m = DOCNO_LINE.matcher(line);
                    if (m.matches()) {
                        docNo = m.group(1);
                        // Progress output: echo each identifier as it is parsed.
                        System.out.println(docNo);
                    }
                } else if (line.equals("<TEXT>")) {
                    textStarted = true;
                    text.setLength(0);
                } else if (line.equals("</TEXT>")) {
                    textStarted = false;
                } else if (line.equals("</DOC>")) {
                    // Checked before the text-append branch so that a record with a
                    // missing </TEXT> does not get the closing tag in its contents.
                    textStarted = false;

                    Document doc = new Document();
                    doc.add(new Field("DocNo", docNo, Field.Store.YES, Field.Index.NO));
                    doc.add(new Field("contents", text.toString(), Field.Store.YES, Field.Index.ANALYZED,
                            Field.TermVector.YES));
                    docs.add(doc);
                } else if (textStarted) {
                    text.append('\n').append(line);
                }
            }
        } finally {
            // Always release the file handle, including when reading or indexing fails.
            reader.close();
        }
        return docs;

    }

    /** Not instantiable: this class only offers the static factory above. */
    private TrecDocument() {
    }
}