br.bireme.ngrams.TestIndex.java Source code

Java tutorial

Introduction

Here is the source code for br.bireme.ngrams.TestIndex.java

Source

/*=========================================================================
    
NGrams  Pan American Health Organization, 2018.
See License at: https://github.com/bireme/NGrams/blob/master/LICENSE.txt
    
  ==========================================================================*/

package br.bireme.ngrams;

import java.io.*;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
import org.xml.sax.SAXException;

/**
 * Tests whether the documents stored in the duplicated-document Lucene index
 * follow the schema file: every field that the schema declares as requiring
 * another field must find that other field present and non-empty in each
 * document.
 *
 * @author Heitor Barbieri
 * date: 20180619
 */
public class TestIndex {
    /** Number of documents between two progress lines written to standard output. */
    private static final int PROGRESS_INTERVAL = 100000;

    /**
     * Opens the Lucene index stored in a directory and checks its documents
     * against the given schema file.
     *
     * @param indexDir path of the directory holding the Lucene index
     * @param schemaFile schema file passed to the {@code NGSchema} constructor
     * @param schemaEncoding character encoding passed to the {@code NGSchema} constructor
     * @return {@code true} if no bad document was found, {@code false} otherwise
     * @throws IOException if the index cannot be opened or read
     * @throws ParserConfigurationException declared by the {@code NGSchema} constructor
     * @throws SAXException declared by the {@code NGSchema} constructor
     */
    public static boolean test(final String indexDir, final String schemaFile, final String schemaEncoding)
            throws IOException, ParserConfigurationException, SAXException {

        final NGSchema schema = new NGSchema("", schemaFile, schemaEncoding);

        // Both the directory and the reader are released here, even when opening
        // the reader or scanning the index fails. The delegate also closes the
        // reader; closing an already closed reader has no effect.
        try (FSDirectory directory = FSDirectory.open(new File(indexDir).toPath());
             DirectoryReader ireader = DirectoryReader.open(directory)) {
            return test(ireader, schema);
        }
    }

    /**
     * Checks the documents of an already opened index against a schema,
     * stopping at the first bad document. A progress line is printed every
     * {@value #PROGRESS_INTERVAL} documents and the {@code id} field of the
     * first bad document is printed when one is found.
     * <p>
     * The reader is always closed before this method returns, whether it
     * completes normally or throws.
     *
     * @param ireader reader over the index to check; closed by this method
     * @param schema schema whose field definitions are used for the check
     * @return {@code true} if no bad document was found, {@code false} otherwise
     * @throws IOException if a document cannot be read or the reader cannot be closed
     * @throws ParserConfigurationException never thrown by this body; kept for caller compatibility
     * @throws SAXException never thrown by this body; kept for caller compatibility
     */
    public static boolean test(final IndexReader ireader, final NGSchema schema)
            throws IOException, ParserConfigurationException, SAXException {
        // try-with-resources guarantees the close on the exception path as well.
        try (IndexReader reader = ireader) {
            final Parameters parameters = schema.getParameters();
            final Map<String, Field> fields = parameters.getNameFields();
            final int maxDoc = reader.maxDoc();

            // NOTE(review): every doc id below maxDoc() is visited; ids of deleted
            // documents are not filtered out here - confirm that is intended.
            for (int id = 0; id < maxDoc; id++) {
                final Document doc = reader.document(id);

                if (id % PROGRESS_INTERVAL == 0) {
                    System.out.println("+++ " + id);
                }
                if (badDocument(doc, fields)) {
                    System.out.println("BAD DOCUMENT => id: " + doc.get("id"));
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * Tells whether a document violates the required-field rule of any schema field.
     *
     * @param doc document to check
     * @param fields schema fields, keyed by name
     * @return {@code true} as soon as one field's required field is missing or
     *         empty in the document, {@code false} if all checks pass
     */
    private static boolean badDocument(final Document doc, final Map<String, Field> fields) {
        for (Field field : fields.values()) {
            if (!checkRequiredField(doc, field)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks the required-field rule of a single schema field.
     *
     * @param doc document to check
     * @param field schema field whose {@code requiredField} names the field that must be present
     * @return {@code true} if the field requires nothing (name is {@code null}
     *         or empty) or if the document holds non-empty content for the
     *         required field; {@code false} otherwise
     */
    private static boolean checkRequiredField(final Document doc, final Field field) {
        final String reqFieldName = field.requiredField;

        // Nothing is required: do not query the document with a null/empty name.
        if ((reqFieldName == null) || reqFieldName.isEmpty()) {
            return true;
        }
        final String recFieldContent = doc.get(reqFieldName);

        return (recFieldContent != null) && !recFieldContent.isEmpty();
    }
}