Example usage for org.apache.lucene.facet.taxonomy TaxonomyWriter commit

List of usage examples for org.apache.lucene.facet.taxonomy TaxonomyWriter commit

Introduction

On this page you can find an example usage of org.apache.lucene.facet.taxonomy TaxonomyWriter commit.

Prototype

public long commit() throws IOException;

Source Link

Document

The second phase of a 2-phase commit.

Usage

From source file:com.chimpler.example.FacetLuceneIndexer.java

License:Apache License

/**
 * Indexes a JSON file of books into a Lucene index with a sidecar facet
 * taxonomy, then runs a sample faceted search for the term "story" and
 * prints the matching documents and facet counts.
 *
 * Expects pre-existing "index" and "taxonomy" directories (both writers are
 * opened in APPEND mode) and the books JSON file at the hard-coded path.
 */
public static void main(String args[]) throws Exception {
    String indexDirectory = "index";
    String taxonomyDirectory = "taxonomy";
    String jsonFileName = "/home/qiuqiang/workspace/facet-lucene-example/books.json";

    IndexWriterConfig writerConfig = new IndexWriterConfig(LUCENE_VERSION,
            new WhitespaceAnalyzer(LUCENE_VERSION));
    writerConfig.setOpenMode(OpenMode.APPEND);
    IndexWriter indexWriter = new IndexWriter(FSDirectory.open(new File(indexDirectory)), writerConfig);

    TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(MMapDirectory.open(new File(taxonomyDirectory)),
            OpenMode.APPEND);

    TaxonomyReader taxonomyReader = new DirectoryTaxonomyReader(FSDirectory.open(new File(taxonomyDirectory)));

    // Close the stream ourselves: IOUtils.toString(InputStream) does not close it.
    String content;
    try (FileInputStream jsonStream = new FileInputStream(jsonFileName)) {
        content = IOUtils.toString(jsonStream);
    }
    JSONArray bookArray = new JSONArray(content);

    // Reusable field instances; values are overwritten for each document below.
    Field idField = new IntField("id", 0, Store.YES);
    Field titleField = new TextField("title", "", Store.YES);
    Field authorsField = new TextField("authors", "", Store.YES);
    Field bookCategoryField = new TextField("book_category", "", Store.YES);

    // Rebuild the index contents from scratch on every run.
    indexWriter.deleteAll();

    FacetFields facetFields = new FacetFields(taxonomyWriter);

    for (int i = 0; i < bookArray.length(); i++) {
        Document document = new Document();

        JSONObject book = bookArray.getJSONObject(i);
        int id = book.getInt("id");
        String title = book.getString("title");
        String bookCategory = book.getString("book_category");

        List<CategoryPath> categoryPaths = new ArrayList<CategoryPath>();

        String authorsString = "";
        JSONArray authors = book.getJSONArray("authors");
        for (int j = 0; j < authors.length(); j++) {
            String author = authors.getString(j);
            if (j > 0) {
                authorsString += ", ";
            }
            categoryPaths.add(new CategoryPath("author", author));
            authorsString += author;
        }
        // Bug fix: the original concatenated "book_category" + bookCategory with
        // no '/' separator, yielding a single fused path component (e.g.
        // "book_categoryFiction") that could never match the "book_category"
        // CountFacetRequest issued below.
        categoryPaths.add(new CategoryPath("book_category/" + bookCategory, '/'));

        idField.setIntValue(id);
        titleField.setStringValue(title);
        authorsField.setStringValue(authorsString);
        bookCategoryField.setStringValue(bookCategory);

        // Facet fields must be attached before the document is added.
        facetFields.addFields(document, categoryPaths);

        document.add(idField);
        document.add(titleField);
        document.add(authorsField);
        document.add(bookCategoryField);

        indexWriter.addDocument(document);

        System.out.printf("Book: id=%d, title=%s, book_category=%s, authors=%s\n", id, title, bookCategory,
                authors);
    }

    // Two-phase commit on the taxonomy. Bug fix: the original swallowed the
    // failure, rolled back (which closes the writer) and then committed the
    // closed writer again further down; now we rethrow after rollback.
    taxonomyWriter.prepareCommit();
    try {
        taxonomyWriter.commit();
    } catch (Exception e) {
        taxonomyWriter.rollback();
        throw e;
    }

    String query = "story";

    // NRT readers obtained from the same writer are distinct instances.
    IndexReader indexReader = DirectoryReader.open(indexWriter, false);
    IndexReader indexReader2 = DirectoryReader.open(indexWriter, false);
    System.out.println(indexReader == indexReader2);
    indexReader2.close(); // bug fix: this reader was leaked in the original

    IndexSearcher indexSearcher = new IndexSearcher(indexReader);

    // Refresh the taxonomy reader so it sees the categories committed above.
    TaxonomyReader newTaxonomyReader = DirectoryTaxonomyReader.openIfChanged(taxonomyReader);
    if (newTaxonomyReader != null) {
        TaxonomyReader tmp = taxonomyReader;
        taxonomyReader = newTaxonomyReader;
        tmp.close();
    } else {
        System.out.println("null");
    }

    ArrayList<FacetRequest> facetRequests = new ArrayList<FacetRequest>();
    facetRequests.add(new CountFacetRequest(new CategoryPath("author"), 100));
    facetRequests.add(new CountFacetRequest(new CategoryPath("book_category"), 100));

    FacetSearchParams searchParams = new FacetSearchParams(facetRequests);

    ComplexPhraseQueryParser queryParser = new ComplexPhraseQueryParser(LUCENE_VERSION, "title",
            new StandardAnalyzer(LUCENE_VERSION));
    Query luceneQuery = queryParser.parse(query);

    // Collectors to get top results and facets
    TopScoreDocCollector topScoreDocCollector = TopScoreDocCollector.create(10, true);
    FacetsCollector facetsCollector = FacetsCollector.create(searchParams, indexReader, taxonomyReader);
    indexSearcher.search(luceneQuery, MultiCollector.wrap(topScoreDocCollector, facetsCollector));
    System.out.println("Found:");

    for (ScoreDoc scoreDoc : topScoreDocCollector.topDocs().scoreDocs) {
        Document document = indexReader.document(scoreDoc.doc);
        System.out.printf("- book: id=%s, title=%s, book_category=%s, authors=%s, score=%f\n",
                document.get("id"), document.get("title"), document.get("book_category"),
                document.get("authors"), scoreDoc.score);
    }

    System.out.println("Facets:");
    for (FacetResult facetResult : facetsCollector.getFacetResults()) {
        System.out.println("- " + facetResult.getFacetResultNode().label);
        for (FacetResultNode facetResultNode : facetResult.getFacetResultNode().subResults) {
            System.out.printf("    - %s (%f)\n", facetResultNode.label.toString(), facetResultNode.value);
            for (FacetResultNode subFacetResultNode : facetResultNode.subResults) {
                System.out.printf("        - %s (%f)\n", subFacetResultNode.label.toString(),
                        subFacetResultNode.value);
            }
        }
    }

    // Close readers before writers.
    taxonomyReader.close();
    indexReader.close();

    // Taxonomy was already committed via the 2-phase commit above; just close.
    taxonomyWriter.close();

    indexWriter.commit();
    indexWriter.close();
}

From source file:com.fuerve.villageelder.actions.results.SearchResultItemTest.java

License:Apache License

/**
 * Builds a two-document dummy index plus its facet taxonomy in the given
 * directories, committing and closing both writers before returning.
 *
 * @param indexDirectory    destination for the main index (recreated)
 * @param taxonomyDirectory destination for the facet taxonomy (recreated)
 * @throws IOException on any Lucene I/O failure
 */
private void buildDummyIndex(final Directory indexDirectory, final Directory taxonomyDirectory)
        throws IOException {
    IndexWriterConfig iwc = new IndexWriterConfig(Lucene.LUCENE_VERSION, Lucene.getPerFieldAnalyzer());
    iwc.setOpenMode(OpenMode.CREATE);
    IndexWriter iw = new IndexWriter(indexDirectory, iwc);
    TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
    List<CategoryPath> categories = new ArrayList<CategoryPath>();
    // One FacetFields instance is sufficient; the original needlessly
    // re-created it for the second document.
    FacetFields facetFields = new FacetFields(tw);

    Document doc = new Document();
    categories.clear();
    doc.add(new StringField("Author", "foo", Store.YES));
    categories.add(new CategoryPath("Author", "foo"));
    doc.add(new LongField("RevisionNumber", 50L, Store.YES));
    doc.add(new StringField("Revision", "50", Store.YES));
    doc.add(new TextField("Message", "stuff", Store.YES));
    // Bug fix: facet fields must be attached to the document BEFORE it is
    // added to the index. The original called addDocument first, so the
    // facet fields were added to an already-indexed Document and never
    // made it into the index.
    facetFields.addFields(doc, categories);
    iw.addDocument(doc);

    doc = new Document();
    categories.clear();
    doc.add(new StringField("Author", "bar", Store.YES));
    categories.add(new CategoryPath("Author", "bar"));
    doc.add(new LongField("RevisionNumber", 5000L, Store.YES));
    doc.add(new StringField("Revision", "5000", Store.YES));
    doc.add(new TextField("Message", "stuff", Store.YES));
    facetFields.addFields(doc, categories);
    iw.addDocument(doc);

    tw.commit();
    tw.close();
    iw.commit();
    iw.close();
}

From source file:com.fuerve.villageelder.search.SearcherTest.java

License:Apache License

/**
 * Test method for {@link com.fuerve.villageelder.search.Searcher#initializeSearch()}.
 *
 * Creates empty committed index and taxonomy in RAM directories, runs
 * {@code initializeSearch()}, and uses reflection to verify that the
 * Searcher's private state (readers, searcher, initialized flag) was set up.
 */
@SuppressWarnings("unused")
@Test
public final void testInitializeSearch() throws Exception {
    // Gather declared fields.
    Field indexDirectoryField = Searcher.class.getDeclaredField("indexDirectory");
    Field taxonomyDirectoryField = Searcher.class.getDeclaredField("taxonomyDirectory");
    Field indexDirectoryNameField = Searcher.class.getDeclaredField("indexDirectoryName");
    Field taxonomyDirectoryNameField = Searcher.class.getDeclaredField("taxonomyDirectoryName");
    Field stringDirectoriesField = Searcher.class.getDeclaredField("stringDirectories");
    Field initializedField = Searcher.class.getDeclaredField("initialized");
    Field searchField = Searcher.class.getDeclaredField("search");
    Field indexReaderField = Searcher.class.getDeclaredField("indexReader");
    Field indexSearcherField = Searcher.class.getDeclaredField("indexSearcher");
    Field taxonomyReaderField = Searcher.class.getDeclaredField("taxonomyReader");

    // Private fields must be made accessible before reading them below.
    indexDirectoryField.setAccessible(true);
    taxonomyDirectoryField.setAccessible(true);
    indexDirectoryNameField.setAccessible(true);
    taxonomyDirectoryNameField.setAccessible(true);
    stringDirectoriesField.setAccessible(true);
    initializedField.setAccessible(true);
    searchField.setAccessible(true);
    indexReaderField.setAccessible(true);
    indexSearcherField.setAccessible(true);
    taxonomyReaderField.setAccessible(true);

    // Setup: empty index + taxonomy, committed so readers can open them.
    Directory indexDirectoryExpected = new RAMDirectory();
    Directory taxonomyDirectoryExpected = new RAMDirectory();

    IndexWriterConfig iwc = new IndexWriterConfig(Lucene.LUCENE_VERSION, Lucene.getPerFieldAnalyzer());
    IndexWriter iw = new IndexWriter(indexDirectoryExpected, iwc);
    TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxonomyDirectoryExpected, OpenMode.CREATE);

    iw.commit();
    tw.commit();

    Searcher target = new Searcher(indexDirectoryExpected, taxonomyDirectoryExpected);
    target.initializeSearch();

    // Gather field values. Several are read but not asserted on
    // (hence @SuppressWarnings("unused") above).
    Directory indexDirectoryActual = (Directory) indexDirectoryField.get(target);
    Directory taxonomyDirectoryActual = (Directory) taxonomyDirectoryField.get(target);
    String indexDirectoryNameActual = (String) indexDirectoryNameField.get(target);
    String taxonomyDirectoryNameActual = (String) taxonomyDirectoryNameField.get(target);
    boolean stringDirectoriesActual = stringDirectoriesField.getBoolean(target);
    boolean initializedActual = initializedField.getBoolean(target);
    Search searchFieldActual = (Search) searchField.get(target);
    IndexReader indexReaderActual = (IndexReader) indexReaderField.get(target);
    IndexSearcher indexSearcherActual = (IndexSearcher) indexSearcherField.get(target);
    TaxonomyReader taxonomyReaderActual = (TaxonomyReader) taxonomyReaderField.get(target);

    // Test: initialization succeeded and the reader/searcher trio exists.
    assertEquals(true, initializedActual);
    assertNotNull(indexReaderActual);
    assertNotNull(indexSearcherActual);
    assertNotNull(taxonomyReaderActual);

    // Finish
    // NOTE(review): the readers obtained via reflection are not closed here;
    // presumably Searcher owns them — confirm against Searcher's contract.
    tw.close();
    iw.close();
}

From source file:com.fuerve.villageelder.search.SearchTest.java

License:Apache License

/**
 * Test method for {@link com.fuerve.villageelder.search.Search#getFacetsCollector(org.apache.lucene.index.DirectoryReader, org.apache.lucene.facet.taxonomy.TaxonomyReader)}.
 *
 * Builds a Search over empty RAM-backed index/taxonomy and verifies that
 * getFacetsCollector returns Lucene's internal DocsOnlyCollector variant.
 */
@Test
@SuppressWarnings({ "unchecked", "unused" })
public final void testGetFacetsCollector() throws Exception {
    // Constants (read reflectively; unused here, hence @SuppressWarnings).
    Field defaultSortField = Search.class.getDeclaredField("DEFAULT_SORT");
    Field defaultFacetsField = Search.class.getDeclaredField("DEFAULT_FACETS");
    Field defaultFacetStringsField = Search.class.getDeclaredField("DEFAULT_FACET_STRINGS");
    Field defaultAnalyzerField = Search.class.getDeclaredField("DEFAULT_ANALYZER");
    Field defaultHitsField = Search.class.getDeclaredField("DEFAULT_HITS");

    defaultSortField.setAccessible(true);
    defaultFacetsField.setAccessible(true);
    defaultFacetStringsField.setAccessible(true);
    defaultAnalyzerField.setAccessible(true);
    defaultHitsField.setAccessible(true);

    final Sort defaultSort = (Sort) defaultSortField.get(null);
    final List<FacetRequest> defaultFacets = (List<FacetRequest>) defaultFacetsField.get(null);
    final Map<String, Integer> defaultFacetStrings = (Map<String, Integer>) defaultFacetStringsField.get(null);
    final Analyzer defaultAnalyzer = (Analyzer) defaultAnalyzerField.get(null);
    final int defaultHits = defaultHitsField.getInt(null);

    // Private members
    Field queryField = Search.class.getDeclaredField("query");
    Field sortField = Search.class.getDeclaredField("sort");
    Field facetsField = Search.class.getDeclaredField("facets");

    queryField.setAccessible(true);
    sortField.setAccessible(true);
    facetsField.setAccessible(true);

    // Test setup
    QueryParser parser = getQueryParser();
    Query queryExpected = parser.parse("test:foo");
    List<FacetRequest> facetsExpected = new ArrayList<FacetRequest>();
    Sort sortExpected = Sort.RELEVANCE;

    Search target = new Search(queryExpected, facetsExpected, sortExpected);
    target.addFacet("test", 100);

    // Gather fields
    Query queryActual = (Query) queryField.get(target);
    Sort sortActual = (Sort) sortField.get(target);
    List<FacetRequest> facetsActual = (List<FacetRequest>) facetsField.get(target);

    // Set up some dummy (empty but committed) indices.
    Directory indexDirectory = new RAMDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(Lucene.LUCENE_VERSION, Lucene.getPerFieldAnalyzer());
    IndexWriter iw = new IndexWriter(indexDirectory, iwc);
    Directory taxonomyDirectory = new RAMDirectory();
    TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);

    iw.commit();
    tw.commit();

    // Test. Bug fix: hold the readers in locals so they can be closed;
    // the original opened them inline and leaked both.
    DirectoryReader indexReader = DirectoryReader.open(indexDirectory);
    DirectoryTaxonomyReader taxonomyReader = new DirectoryTaxonomyReader(taxonomyDirectory);
    FacetsCollector actual = target.getFacetsCollector(indexReader, taxonomyReader);

    assertEquals("DocsOnlyCollector", actual.getClass().getSimpleName());

    // Finish: close readers before writers, then both directories
    // (the original never closed indexDirectory).
    taxonomyReader.close();
    indexReader.close();
    iw.close();
    tw.close();
    taxonomyDirectory.close();
    indexDirectory.close();
}

From source file:com.khepry.frackhem.entities.Blendeds.java

License:Apache License

/**
 * Indexes the Blendeds text file into a Lucene index and facet taxonomy,
 * enriching each record with toxicity data keyed by the CAS/EDF id column.
 *
 * The first row of the file is treated as the column header row; subsequent
 * rows become one Document each. Progress is reported to stdout and/or a
 * message queue depending on the instance's output flags.
 *
 * @param textFilePath      path to the delimited input file
 * @param textColSeparator  column separator regex for String.split
 * @param casEdfIdFieldName name of the column holding the CAS/EDF id
 * @param toxicities        toxicity records keyed by CAS/EDF id
 * @throws IOException on file or index I/O failure
 */
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName,
        Map<String, Toxicity> toxicities) throws IOException {

    String message;

    message = "Start Indexing Blendeds via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }

    File textFile = new File(textFilePath);
    if (textFile.exists()) {

        // Recreate the index folder from scratch.
        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        } else {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
        }

        // Recreate the taxonomy folder from scratch.
        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        } else {
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
        }

        if (indexFolder.exists() && taxonomyFolder.exists()) {

            List<String> colHeaders = new ArrayList<>();
            Map<String, Integer> colIndexes = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();

            String[] pieces;
            String[] tuples;

            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, indexField);
            }

            // statsFields entries are "name" or "name:label" pairs.
            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }

            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);

            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);

            List<CategoryPath> taxonomyCategories = new ArrayList<>();

            String line;
            Integer rcdCount = 0;
            StringBuilder sb = new StringBuilder();
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    // Header row: record each column name and its position.
                    int i = 0;
                    for (String colHeader : pieces) {
                        colHeaders.add(colHeader.trim());
                        // Bug fix: i was never incremented, so every column
                        // previously mapped to index 0.
                        colIndexes.put(colHeader, i++);
                    }
                } else {
                    if (pieces.length == colHeaders.size()) {
                        sb.setLength(0);
                        Document document = new Document();
                        for (int i = 0; i < pieces.length; i++) {
                            Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
                            document.add(field);
                            // Accumulate indexed columns into the catch-all "text" field.
                            if (mapIndexFields.containsKey(colHeaders.get(i))) {
                                if (!pieces[i].trim().equals("")) {
                                    sb.append(pieces[i].trim());
                                    sb.append(" ");
                                }
                            }
                        }
                        // append toxicity information to the document
                        String toxCasEdfId = document.get(casEdfIdFieldName).trim();
                        Toxicity toxicity = new Toxicity();
                        if (toxicities.containsKey(toxCasEdfId)) {
                            toxicity = toxicities.get(toxCasEdfId);
                            document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxChemicalName().trim());
                            sb.append(" ");
                            document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxRecognized().trim());
                            sb.append(" ");
                            document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxSuspected().trim());
                            sb.append(" ");
                        } else {
                            // No toxicity record: index empty values so the fields exist.
                            document.add(new TextField("toxChemicalName", "", Store.YES));
                            document.add(new TextField("toxRecognized", "", Store.YES));
                            document.add(new TextField("toxSuspected", "", Store.YES));
                        }
                        Field field = new TextField("text", sb.toString().trim(), Store.NO);
                        document.add(field);

                        String toxChemical = toxicity.getToxChemicalName().trim();

                        // categorize recognized toxicities
                        String toxRecognized = toxicity.getToxRecognized().trim();
                        if (!toxRecognized.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxRecognized.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories
                                            .add(new CategoryPath("toxRecognized", "Toxicity", value));
                                }
                            }
                        }

                        // categorize suspected toxicities
                        String toxSuspected = toxicity.getToxSuspected().trim();
                        if (!toxSuspected.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxSuspected.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                                }
                            }
                        }

                        // build up "stats" taxonomy categories
                        for (String statsKey : mapStatsFields.keySet()) {
                            if (mapIndexFields.containsKey(statsKey)) {
                                String fieldValue = mapIndexFields.get(statsKey);
                                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("Blendeds", statsKey, fieldValue));
                                }
                            }
                        }

                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(document, taxonomyCategories);
                        }

                        indexWriter.addDocument(document);
                        if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                            message = "Records indexed: " + rcdCount;
                            if (outputToSystemOut) {
                                System.out.println(message);
                            }
                            if (outputToMsgQueue) {
                                progressMessageQueue.send(new MessageInput(message));
                            }
                        }

                        // Categories are per-document; reset for the next record.
                        taxonomyCategories.clear();
                    }
                }
            }
            br.close();
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }

            sb.setLength(0);
            sb.trimToSize();

            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();

            taxonomyWriter.commit();
            taxonomyWriter.close();

            analyzer.close();

            indexDirectory.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
        }
        message = "Ended Indexing Blendeds via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}

From source file:com.khepry.frackhem.entities.Chemicals.java

License:Apache License

/**
 * Indexes the Chemicals text file into a Lucene index and facet taxonomy,
 * enriching each record with toxicity data keyed by the CAS/EDF id column.
 *
 * The first row of the file is treated as the column header row; subsequent
 * rows become one Document each. Progress is reported to stdout and/or a
 * message queue depending on the instance's output flags.
 *
 * @param textFilePath      path to the delimited input file
 * @param textColSeparator  column separator regex for String.split
 * @param casEdfIdFieldName name of the column holding the CAS/EDF id
 * @param toxicities        toxicity records keyed by CAS/EDF id
 * @throws IOException on file or index I/O failure
 */
public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName,
        Map<String, Toxicity> toxicities) throws IOException {

    String message;

    message = "Start Indexing Chemicals via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }

    File textFile = new File(textFilePath);
    if (textFile.exists()) {

        // Recreate the index folder from scratch.
        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        } else {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
        }

        // Recreate the taxonomy folder from scratch.
        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        } else {
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
        }

        if (indexFolder.exists() && taxonomyFolder.exists()) {

            List<String> colHeaders = new ArrayList<>();
            Map<String, Integer> colIndexes = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();

            String[] pieces;
            String[] tuples;

            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, indexField);
            }

            // statsFields entries are "name" or "name:label" pairs.
            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }

            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);

            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);

            List<CategoryPath> taxonomyCategories = new ArrayList<>();

            String line;
            Integer rcdCount = 0;
            StringBuilder sb = new StringBuilder();
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    // Header row: record each column name and its position.
                    int i = 0;
                    for (String colHeader : pieces) {
                        colHeaders.add(colHeader.trim());
                        // Bug fix: i was never incremented, so every column
                        // previously mapped to index 0.
                        colIndexes.put(colHeader, i++);
                    }
                } else {
                    if (pieces.length == colHeaders.size()) {
                        sb.setLength(0);
                        Document document = new Document();
                        for (int i = 0; i < pieces.length; i++) {
                            Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
                            document.add(field);
                            // Accumulate indexed columns into the catch-all "text" field.
                            if (mapIndexFields.containsKey(colHeaders.get(i))) {
                                if (!pieces[i].trim().equals("")) {
                                    sb.append(pieces[i].trim());
                                    sb.append(" ");
                                }
                            }
                        }
                        // append toxicity information to the document
                        String toxCasEdfId = document.get(casEdfIdFieldName).trim();
                        Toxicity toxicity = new Toxicity();
                        if (toxicities.containsKey(toxCasEdfId)) {
                            toxicity = toxicities.get(toxCasEdfId);
                            document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxChemicalName().trim());
                            sb.append(" ");
                            document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxRecognized().trim());
                            sb.append(" ");
                            document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxSuspected().trim());
                            sb.append(" ");
                        } else {
                            // No toxicity record: index empty values so the fields exist.
                            document.add(new TextField("toxChemicalName", "", Store.YES));
                            document.add(new TextField("toxRecognized", "", Store.YES));
                            document.add(new TextField("toxSuspected", "", Store.YES));
                        }
                        Field field = new TextField("text", sb.toString().trim(), Store.NO);
                        document.add(field);

                        String toxChemical = toxicity.getToxChemicalName().trim();

                        // categorize recognized toxicities
                        String toxRecognized = toxicity.getToxRecognized().trim();
                        if (!toxRecognized.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxRecognized.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories
                                            .add(new CategoryPath("toxRecognized", "Toxicity", value));
                                }
                            }
                        }

                        // categorize suspected toxicities
                        String toxSuspected = toxicity.getToxSuspected().trim();
                        if (!toxSuspected.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxSuspected.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                                }
                            }
                        }

                        // build up "stats" taxonomy categories
                        for (String statsKey : mapStatsFields.keySet()) {
                            if (mapIndexFields.containsKey(statsKey)) {
                                String fieldValue = mapIndexFields.get(statsKey);
                                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("Chemicals", statsKey, fieldValue));
                                }
                            }
                        }

                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(document, taxonomyCategories);
                        }

                        indexWriter.addDocument(document);
                        if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                            message = "Records indexed: " + rcdCount;
                            if (outputToSystemOut) {
                                System.out.println(message);
                            }
                            if (outputToMsgQueue) {
                                progressMessageQueue.send(new MessageInput(message));
                            }
                        }

                        // Categories are per-document; reset for the next record.
                        taxonomyCategories.clear();
                    }
                }
            }
            br.close();
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }

            sb.setLength(0);
            sb.trimToSize();

            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();

            taxonomyWriter.commit();
            taxonomyWriter.close();

            analyzer.close();

            indexDirectory.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
        }
        message = "Ended Indexing Chemicals via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}

From source file:com.khepry.frackhem.entities.Reports.java

License:Apache License

/**
 * Indexes report records from a delimited text file into a Lucene index and a
 * companion facet taxonomy, emitting one Lucene document per "level break"
 * (i.e. whenever the values of the configured level fields change between rows)
 * rather than one document per input row.
 * <p>
 * Both the index folder and the taxonomy folder are deleted and recreated, so
 * each invocation rebuilds the index from scratch. Progress and status messages
 * go to System.out and/or a progress message queue, depending on the
 * {@code outputToSystemOut} / {@code outputToMsgQueue} instance flags.
 *
 * @param textPath         path of the delimited text file to index; first row
 *                         must be a column-header row
 * @param textColSeparator column-separator regex passed to String.split
 * @param toxicities       lookup of CAS/EDF id -&gt; Toxicity, used to attach
 *                         recognized/suspected toxicity terms to each document
 * @param parseFields      names of the toxicity fields to accumulate and index
 *                         (e.g. "toxRecognized", "toxSuspected")
 * @throws IOException on any file or index I/O failure
 */
public void indexViaLucene(String textPath, String textColSeparator, Map<String, Toxicity> toxicities,
        String... parseFields) throws IOException {

    String message; // status text reused for console and/or queue reporting

    message = "Start Indexing Reports via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }

    File textFile = new File(textPath);
    if (textFile.exists()) {

        File indexFolder = new File(indexFolderPath);

        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        }

        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        }

        if (indexFolder.exists() && taxonomyFolder.exists()) {

            // wipe and recreate both folders: every run is a full rebuild
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }

            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }

            // mapBreakFields holds the previous row-group's level values,
            // mapLevelFields the current row's; a difference = "level break"
            Map<String, String> mapBreakFields = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapLevelFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();
            Map<String, Integer> mapColIndexes = new LinkedHashMap<>();

            String[] pieces;
            String[] tuples;

            // comma-separated field-name config strings come from instance state
            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, "");
            }

            pieces = levelFields.split(",");
            for (String levelField : pieces) {
                mapBreakFields.put(levelField, "");
                mapLevelFields.put(levelField, "");
            }

            // statsFields entries may be "name" or "name:alias"
            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }

            // per-parse-field set of distinct toxicity terms accumulated
            // across the rows of the current level group (TreeMap = sorted,
            // keys double as values to deduplicate)
            Map<String, Map<String, String>> mapToxValues = new LinkedHashMap<>();
            for (String parseField : parseFields) {
                mapToxValues.put(parseField, new TreeMap<String, String>());
            }

            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);

            List<CategoryPath> taxonomyCategories = new ArrayList<>();

            String line;

            StringBuilder sbIndex = new StringBuilder();
            // NOTE(review): sbLevel is declared and cleared at the end but
            // never appended to anywhere in this method — confirm it is dead
            StringBuilder sbLevel = new StringBuilder();

            Integer outCount = 0; // documents written (one per level group)
            Integer rcdCount = 0; // input rows read (including the header)

            Boolean firstDataRecordHandled = false;

            // NOTE(review): reader/writers are not closed if an exception is
            // thrown below (no try/finally) — resources leak on failure
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    // header row: remember each column's position by name
                    int i = 0;
                    for (String colHeader : pieces) {
                        mapColIndexes.put(colHeader.trim(), i);
                        i++;
                    }
                } else {
                    // capture this row's values for the level-break fields
                    for (String key : mapLevelFields.keySet()) {
                        if (mapColIndexes.containsKey(key)) {
                            String value = pieces[mapColIndexes.get(key)].trim();
                            // build up level-break values
                            if (mapLevelFields.containsKey(key)) {
                                mapLevelFields.put(key, value);
                            }
                        }
                    }
                    // seed the break values from the first data row so the
                    // very first row does not register as a spurious break
                    if (!firstDataRecordHandled) {
                        mapBreakFields.putAll(mapLevelFields);
                        firstDataRecordHandled = true;
                    }
                    // if there is a "level break"
                    if (!mapLevelFields.equals(mapBreakFields)) {
                        // flush the completed group as one Lucene document
                        Document tgtDocument = new Document();
                        for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
                            Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
                            tgtDocument.add(field);
                        }
                        for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
                            String fieldName = toxEntry.getKey();
                            String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
                            // System.out.println(fieldName + ": " + fieldValue);
                            sbIndex.append(fieldValue);
                            sbIndex.append(" ");
                            tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
                            // build up "Toxicity" taxonomy categories
                            for (String value : fieldValue.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
                                }
                            }
                            // build up "stats" taxonomy categories
                            // NOTE(review): this loop is nested inside the
                            // tox-field loop, so each stats category is added
                            // once per parse field — confirm the duplication
                            // is intended (it also uses the NEW group's level
                            // values while the document carries the OLD ones)
                            for (String statsKey : mapStatsFields.keySet()) {
                                if (mapLevelFields.containsKey(statsKey)) {
                                    String levelValue = mapLevelFields.get(statsKey);
                                    if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
                                        taxonomyCategories
                                                .add(new CategoryPath("Reports", statsKey, levelValue));
                                    }
                                }
                            }
                        }
                        // catch-all "text" field carries the accumulated
                        // searchable terms but is not stored
                        tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(tgtDocument, taxonomyCategories);
                            // System.out.println("Taxonomies added: " +
                            // taxonomyCategories.size());
                        }
                        indexWriter.addDocument(tgtDocument);
                        outCount++;
                        // reset per-group accumulators for the next group
                        sbIndex.setLength(0);
                        for (String key : mapToxValues.keySet()) {
                            mapToxValues.get(key).clear();
                        }
                        taxonomyCategories.clear();
                        mapBreakFields.putAll(mapLevelFields);
                    }
                    // build up text index values
                    for (String key : mapLevelFields.keySet()) {
                        if (mapColIndexes.containsKey(key)) {
                            String value = pieces[mapColIndexes.get(key)].trim();
                            if (!value.equals("")) {
                                // build up 'text' field index value
                                if (mapIndexFields.containsKey(key)) {
                                    sbIndex.append(value);
                                    sbIndex.append(" ");
                                }
                            }
                        }
                    }
                    // build up toxicity values for later level-break use
                    if (mapColIndexes.containsKey(casEdfIdFieldName)) {
                        Toxicity toxicity = toxicities.get(pieces[mapColIndexes.get(casEdfIdFieldName)].trim());
                        if (toxicity != null) {
                            // build up recognized toxicity values
                            String[] toxRValues = toxicity.getToxRecognized().split(",");
                            for (String toxValue : toxRValues) {
                                if (!toxValue.equals("")) {
                                    if (!mapToxValues.get("toxRecognized").containsKey(toxValue)) {
                                        mapToxValues.get("toxRecognized").put(toxValue, toxValue);
                                    }
                                }
                            }
                            // build up suspected toxicity values
                            String[] toxSValues = toxicity.getToxSuspected().split(",");
                            for (String toxValue : toxSValues) {
                                if (!toxValue.equals("")) {
                                    if (!mapToxValues.get("toxSuspected").containsKey(toxValue)) {
                                        mapToxValues.get("toxSuspected").put(toxValue, toxValue);
                                    }
                                }
                            }
                        }
                    }
                    if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                        message = "Records indexed: " + rcdCount;
                        if (outputToSystemOut) {
                            System.out.println(message);
                        }
                        if (outputToMsgQueue) {
                            progressMessageQueue.send(new MessageInput(message));
                        }
                    }
                }
            }
            br.close();
            // handle end-of-file processing
            // (flush the final, still-open level group as its own document;
            // this mirrors the level-break block above)
            Document tgtDocument = new Document();
            for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
                Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
                tgtDocument.add(field);
            }
            for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
                String fieldName = toxEntry.getKey();
                String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
                // System.out.println(fieldName + ": " + fieldValue);
                sbIndex.append(fieldValue);
                sbIndex.append(" ");
                tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
                // build up "Toxicity" taxonomy categories
                for (String value : fieldValue.replace(" ", ",").split(",")) {
                    if (!value.trim().equals("")) {
                        taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
                    }
                }
                // build up "stats" taxonomy categories
                // NOTE(review): same per-parse-field duplication as in the
                // in-loop flush above — confirm it is intended
                for (String statsKey : mapStatsFields.keySet()) {
                    if (mapLevelFields.containsKey(statsKey)) {
                        String levelValue = mapLevelFields.get(statsKey);
                        if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
                            taxonomyCategories.add(new CategoryPath("Reports", statsKey, levelValue));
                        }
                    }
                }
            }
            tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
            if (taxonomyCategories.size() > 0) {
                facetFields.addFields(tgtDocument, taxonomyCategories);
                // System.out.println("Taxonomies added: " +
                // taxonomyCategories.size());
            }
            indexWriter.addDocument(tgtDocument);
            outCount++;
            message = "Records processed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
            message = "Records indexed: " + outCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }

            // release builder capacity before the (potentially large) merge
            sbIndex.setLength(0);
            sbIndex.trimToSize();

            sbLevel.setLength(0);
            sbLevel.trimToSize();

            mapToxValues.clear();

            // commit + forceMerge(1) collapses the index to a single segment
            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();

            analyzer.close();
            indexDirectory.close();

            // taxonomy commit is the second phase of the 2-phase commit
            taxonomyWriter.commit();
            taxonomyWriter.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
        }
        message = "Ended Indexing Reports via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}

From source file:com.khepry.frackhem.entities.Toxicities.java

License:Apache License

/**
 * Indexes toxicity records from a delimited text file into a Lucene index,
 * creating one Lucene document per data row plus a companion facet taxonomy
 * with "toxRecognized" / "toxSuspected" / "Toxicities" category paths.
 * <p>
 * Both the index folder and the taxonomy folder are deleted and recreated, so
 * each invocation rebuilds the index from scratch. Progress and status
 * messages go to System.out and/or a progress message queue, depending on the
 * {@code outputToSystemOut} / {@code outputToMsgQueue} instance flags.
 * <p>
 * Expected input layout: the first row is a column-header row; data rows must
 * have the same column count, with column 0 = CAS/EDF id, column 1 = chemical
 * name, column 2 = recognized toxicities, column 3 = suspected toxicities.
 *
 * @param textFilePath     path of the delimited text file to index
 * @param textColSeparator column-separator regex passed to String.split
 * @throws IOException on any file or index I/O failure
 */
public void indexViaLucene(String textFilePath, String textColSeparator) throws IOException {

    String message; // status text reused for console and/or queue reporting

    message = "Start Indexing Toxicities via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }

    File textFile = new File(textFilePath);
    if (textFile.exists()) {

        // wipe and recreate the index folder: every run is a full rebuild
        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        } else {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
        }

        // wipe and recreate the taxonomy folder likewise
        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        } else {
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
        }

        if (indexFolder.exists() && taxonomyFolder.exists()) {

            List<String> colHeaders = new ArrayList<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();

            String[] pieces;
            String[] tuples;

            // comma-separated field-name config strings come from instance state
            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, indexField);
            }

            // statsFields entries may be "name" or "name:alias"
            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }

            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);

            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);

            List<CategoryPath> taxonomyCategories = new ArrayList<>();

            String line;
            Integer rcdCount = 0; // input rows read (including the header)
            StringBuilder sb = new StringBuilder();
            // try-with-resources guarantees the reader is closed even if an
            // I/O error occurs mid-file (it was previously leaked on failure)
            try (BufferedReader br = new BufferedReader(new FileReader(textFile))) {
                while ((line = br.readLine()) != null) {
                    rcdCount++;
                    pieces = line.split(textColSeparator);
                    if (rcdCount == 1) {
                        // header row: remember the column names in order
                        for (String colHeader : pieces) {
                            colHeaders.add(colHeader.trim());
                        }
                    } else {
                        // rows with a wrong column count are silently skipped
                        if (pieces.length == colHeaders.size()) {
                            sb.setLength(0);
                            Document document = new Document();
                            // one stored field per column; index-field values
                            // also accumulate into the catch-all "text" field
                            for (int i = 0; i < pieces.length; i++) {
                                Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
                                document.add(field);
                                if (mapIndexFields.containsKey(colHeaders.get(i))) {
                                    if (!pieces[i].trim().equals("")) {
                                        sb.append(pieces[i].trim());
                                        sb.append(" ");
                                    }
                                }
                            }
                            Field field = new TextField("text", sb.toString().trim(), Store.NO);
                            document.add(field);

                            // fixed column positions (see method javadoc)
                            String toxCasEdfId = pieces[0].trim();
                            String toxChemical = pieces[1].trim();

                            // categorize recognized toxicities
                            // ("/" is not legal in a facet path component,
                            // hence the replacement with "|")
                            String toxRecognized = pieces[2].trim();
                            if (!toxRecognized.equals("")) {
                                taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
                                taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical",
                                        toxChemical.replace("/", "|")));
                                for (String value : toxRecognized.replace(" ", ",").split(",")) {
                                    if (!value.trim().equals("")) {
                                        taxonomyCategories
                                                .add(new CategoryPath("toxRecognized", "Toxicity", value));
                                    }
                                }
                            }

                            // categorize suspected toxicities
                            String toxSuspected = pieces[3].trim();
                            if (!toxSuspected.equals("")) {
                                taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
                                taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical",
                                        toxChemical.replace("/", "|")));
                                for (String value : toxSuspected.replace(" ", ",").split(",")) {
                                    if (!value.trim().equals("")) {
                                        taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                                    }
                                }
                            }

                            // build up "stats" taxonomy categories
                            // NOTE(review): fieldValue comes from mapIndexFields,
                            // whose values are the field NAMES themselves, so
                            // these categories are identical for every record —
                            // confirm a per-record value wasn't intended
                            for (String statsKey : mapStatsFields.keySet()) {
                                if (mapIndexFields.containsKey(statsKey)) {
                                    String fieldValue = mapIndexFields.get(statsKey);
                                    if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                                        taxonomyCategories
                                                .add(new CategoryPath("Toxicities", statsKey, fieldValue));
                                    }
                                }
                            }

                            if (taxonomyCategories.size() > 0) {
                                facetFields.addFields(document, taxonomyCategories);
                                // System.out.println("Taxonomies added: " +
                                // taxonomyCategories.size());
                            }

                            indexWriter.addDocument(document);
                            if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                                message = "Records indexed: " + rcdCount;
                                if (outputToSystemOut) {
                                    System.out.println(message);
                                }
                                if (outputToMsgQueue) {
                                    progressMessageQueue.send(new MessageInput(message));
                                }
                            }

                            taxonomyCategories.clear();
                        }
                    }
                }
            }
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }

            // release builder capacity before the (potentially large) merge
            sb.setLength(0);
            sb.trimToSize();

            // commit + forceMerge(1) collapses the index to a single segment
            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();

            // taxonomy commit is the second phase of the 2-phase commit
            taxonomyWriter.commit();
            taxonomyWriter.close();

            analyzer.close();

            indexDirectory.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
            // fix: the Reports/Chemicals variants of this method also notify
            // the message queue on this error path; this one previously wrote
            // only to System.err, so queue listeners never saw the failure
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
        }
        message = "Ended Indexing Toxicities via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}

From source file:edu.harvard.iq.dvn.core.index.Indexer.java

License:Apache License

protected void addDocument(Study study) throws IOException {

    StudyVersion sv = null;// w  w w  .  jav a 2 s .  c  om
    if (study.getReleasedVersion() != null) {
        sv = study.getReleasedVersion();
        Metadata metadata = sv.getMetadata();

        Document doc = new Document();
        logger.fine("Start indexing study " + study.getStudyId());
        addText(4.0f, doc, "title", metadata.getTitle());
        addKeyword(doc, "id", study.getId().toString());
        addText(1.0f, doc, "studyId", study.getStudyId());
        addKeyword(doc, "studyId", study.getStudyId());
        //        addText(1.0f,  doc,"owner",study.getOwner().getName());
        addText(1.0f, doc, "dvOwnerId", Long.toString(study.getOwner().getId()));
        String dvNetworkId = study.getOwner().getVdcNetwork().getId().toString();
        /* This is the ID of the DV Network to which the study belongs 
         * directly, through its owner DV:
         */
        addText(1.0f, doc, "ownerDvNetworkId", dvNetworkId);
        /* Plus it may belong to these extra Networks, through linking into
         * collections in DVs that belong to other Networks:
         */
        logger.fine("Using network id " + dvNetworkId);
        addText(1.0f, doc, "dvNetworkId", dvNetworkId);
        List<Long> linkedToNetworks = study.getLinkedToNetworkIds();
        if (linkedToNetworks != null) {
            for (Long vdcnetworkid : linkedToNetworks) {
                addText(1.0f, doc, "dvNetworkId", vdcnetworkid.toString());
            }
        }
        addDate(1.0f, doc, "productionDate", metadata.getProductionDate());
        addDate(1.0f, doc, "distributionDate", metadata.getDistributionDate());

        Collection<StudyKeyword> keywords = metadata.getStudyKeywords();
        for (Iterator it = keywords.iterator(); it.hasNext();) {
            StudyKeyword elem = (StudyKeyword) it.next();
            addText(1.0f, doc, "keywordValue", elem.getValue());
        }
        Collection<StudyTopicClass> topicClassifications = metadata.getStudyTopicClasses();
        for (Iterator it = topicClassifications.iterator(); it.hasNext();) {
            StudyTopicClass elem = (StudyTopicClass) it.next();
            addText(1.0f, doc, "topicClassValue", elem.getValue());
            addText(1.0f, doc, "topicVocabClassURI", elem.getVocabURI());
            addText(1.0f, doc, "topicClassVocabulary", elem.getVocab());
        }
        Collection<StudyAbstract> abstracts = metadata.getStudyAbstracts();
        for (Iterator it = abstracts.iterator(); it.hasNext();) {
            StudyAbstract elem = (StudyAbstract) it.next();
            addText(2.0f, doc, "abstractText", elem.getText());
            addDate(1.0f, doc, "abstractDate", elem.getDate());

        }
        Collection<StudyAuthor> studyAuthors = metadata.getStudyAuthors();
        for (Iterator it = studyAuthors.iterator(); it.hasNext();) {
            StudyAuthor elem = (StudyAuthor) it.next();
            addText(3.0f, doc, "authorName", elem.getName());
            addText(1.0f, doc, "authorName", elem.getName());
            addText(1.0f, doc, "authorAffiliation", elem.getAffiliation());
        }
        Collection<StudyProducer> studyProducers = metadata.getStudyProducers();
        for (Iterator itProducers = studyProducers.iterator(); itProducers.hasNext();) {
            StudyProducer studyProducer = (StudyProducer) itProducers.next();
            addText(1.0f, doc, "producerName", studyProducer.getName());
            addText(1.0f, doc, "producerName", studyProducer.getAbbreviation());
            addText(1.0f, doc, "producerName", studyProducer.getLogo());
            addText(1.0f, doc, "producerName", studyProducer.getUrl());
            addText(1.0f, doc, "producerName", studyProducer.getAffiliation());
            addText(1.0f, doc, "producerName", studyProducer.getMetadata().getProductionPlace());
        }
        Collection<StudyDistributor> studyDistributors = metadata.getStudyDistributors();
        for (Iterator it = studyDistributors.iterator(); it.hasNext();) {
            StudyDistributor studyDistributor = (StudyDistributor) it.next();
            addText(1.0f, doc, "distributorName", studyDistributor.getName());
            addText(1.0f, doc, "distributorName", studyDistributor.getAbbreviation());
            addText(1.0f, doc, "distributorName", studyDistributor.getLogo());
            addText(1.0f, doc, "distributorName", studyDistributor.getUrl());
            addText(1.0f, doc, "distributorName", studyDistributor.getAffiliation());
        }
        Collection<StudyOtherId> otherIds = metadata.getStudyOtherIds();
        for (Iterator it = otherIds.iterator(); it.hasNext();) {
            StudyOtherId elem = (StudyOtherId) it.next();
            addText(1.0f, doc, "otherId", elem.getOtherId());
            addText(1.0f, doc, "otherIdAgency", elem.getAgency());
        }
        addText(1.0f, doc, "fundingAgency", metadata.getFundingAgency());
        addText(1.0f, doc, "distributorContact", metadata.getDistributorContact());
        addText(1.0f, doc, "distributorContactAffiliation", metadata.getDistributorContactAffiliation());
        addText(1.0f, doc, "distributorContactEmail", metadata.getDistributorContactEmail());
        addDate(1.0f, doc, "dateOfDeposit", metadata.getDateOfDeposit());
        addText(1.0f, doc, "depositor", metadata.getDepositor());
        addText(1.0f, doc, "seriesName", metadata.getSeriesName());
        addText(1.0f, doc, "seriesInformation", metadata.getSeriesInformation());
        addText(1.0f, doc, "studyVersion", metadata.getStudyVersionText());
        addText(1.0f, doc, "versionDate", metadata.getVersionDate());
        addText(1.0f, doc, "originOfSources", metadata.getOriginOfSources());
        addText(1.0f, doc, "dataSources", metadata.getDataSources());
        addText(1.0f, doc, "frequencyOfDataCollection", metadata.getFrequencyOfDataCollection());
        addText(1.0f, doc, "universe", metadata.getUniverse());
        addText(1.0f, doc, "unitOfAnalysis", metadata.getUnitOfAnalysis());
        addText(1.0f, doc, "dataCollector", metadata.getDataCollector());
        addText(1.0f, doc, "kindOfData", metadata.getKindOfData());
        addText(1.0f, doc, "geographicCoverage", metadata.getGeographicCoverage());
        addText(1.0f, doc, "geographicUnit", metadata.getGeographicUnit());
        addDate(1.0f, doc, "timePeriodCoveredEnd", metadata.getTimePeriodCoveredEnd());
        addDate(1.0f, doc, "timePeriodCoveredStart", metadata.getTimePeriodCoveredStart());
        addDate(1.0f, doc, "dateOfCollection", metadata.getDateOfCollectionStart());
        addDate(1.0f, doc, "dateOfCollectionEnd", metadata.getDateOfCollectionEnd());
        addText(1.0f, doc, "country", metadata.getCountry());
        addText(1.0f, doc, "timeMethod", metadata.getTimeMethod());
        addText(1.0f, doc, "samplingProcedure", metadata.getSamplingProcedure());
        addText(1.0f, doc, "deviationsFromSampleDesign", metadata.getDeviationsFromSampleDesign());
        addText(1.0f, doc, "collectionMode", metadata.getCollectionMode());
        addText(1.0f, doc, "researchInstrument", metadata.getResearchInstrument());
        addText(1.0f, doc, "characteristicOfSources", metadata.getCharacteristicOfSources());
        addText(1.0f, doc, "accessToSources", metadata.getAccessToSources());
        addText(1.0f, doc, "dataCollectionSituation", metadata.getDataCollectionSituation());
        addText(1.0f, doc, "actionsToMinimizeLoss", metadata.getActionsToMinimizeLoss());
        addText(1.0f, doc, "controlOperations", metadata.getControlOperations());
        addText(1.0f, doc, "weighting", metadata.getWeighting());
        addText(1.0f, doc, "cleaningOperations", metadata.getCleaningOperations());
        addText(1.0f, doc, "studyLevelErrorNotes", metadata.getStudyLevelErrorNotes());
        List<StudyNote> studyNotes = metadata.getStudyNotes();
        for (Iterator it = studyNotes.iterator(); it.hasNext();) {
            StudyNote elem = (StudyNote) it.next();
            addText(1.0f, doc, "studyNoteType", elem.getType());
            addText(1.0f, doc, "studyNoteSubject", elem.getSubject());
            addText(1.0f, doc, "studyNoteText", elem.getText());
        }
        addText(1.0f, doc, "responseRate", metadata.getResponseRate());
        addText(1.0f, doc, "samplingErrorEstimate", metadata.getSamplingErrorEstimate());
        addText(1.0f, doc, "otherDataAppraisal", metadata.getOtherDataAppraisal());
        addText(1.0f, doc, "placeOfAccess", metadata.getPlaceOfAccess());
        addText(1.0f, doc, "originalArchive", metadata.getOriginalArchive());
        addText(1.0f, doc, "availabilityStatus", metadata.getAvailabilityStatus());
        addText(1.0f, doc, "collectionSize", metadata.getCollectionSize());
        addText(1.0f, doc, "studyCompletion", metadata.getStudyCompletion());
        addText(1.0f, doc, "confidentialityDeclaration", metadata.getConfidentialityDeclaration());
        addText(1.0f, doc, "specialPermissions", metadata.getSpecialPermissions());
        addText(1.0f, doc, "restrictions", metadata.getRestrictions());
        addText(1.0f, doc, "contact", metadata.getContact());
        addText(1.0f, doc, "citationRequirements", metadata.getCitationRequirements());
        addText(1.0f, doc, "depositorRequirements", metadata.getDepositorRequirements());
        addText(1.0f, doc, "conditions", metadata.getConditions());
        addText(1.0f, doc, "disclaimer", metadata.getDisclaimer());
        List<StudyRelMaterial> relMaterials = metadata.getStudyRelMaterials();
        for (Iterator it = relMaterials.iterator(); it.hasNext();) {
            StudyRelMaterial elem = (StudyRelMaterial) it.next();
            addText(1.0f, doc, "relatedMaterial", elem.getText());
        }
        List<StudyRelStudy> relStudies = metadata.getStudyRelStudies();
        for (Iterator it = relStudies.iterator(); it.hasNext();) {
            StudyRelStudy elem = (StudyRelStudy) it.next();
            addText(1.0f, doc, "relatedStudy", elem.getText());
        }
        List<StudyOtherRef> otherRefs = metadata.getStudyOtherRefs();
        for (Iterator it = otherRefs.iterator(); it.hasNext();) {
            StudyOtherRef elem = (StudyOtherRef) it.next();
            addText(1.0f, doc, "otherReferences", elem.getText());
        }

        for (StudyRelPublication elem : metadata.getStudyRelPublications()) {
            String publicationId = (elem.getIdType() != null ? elem.getIdType() + ":" : "")
                    + elem.getIdNumber();
            if (elem.isReplicationData()) {
                addText(1.0f, doc, "replicationFor", elem.getText());
                addText(1.0f, doc, "replicationForId", publicationId);
                addText(1.0f, doc, "replicationForURL", elem.getUrl());
            } else {
                addText(1.0f, doc, "relatedPublications", elem.getText());
                addText(1.0f, doc, "relatedPublicationsId", publicationId);
                addText(1.0f, doc, "relatedPublicationsURL", elem.getUrl());
            }
        }

        /*     addText(1.0f,  doc,"relatedMaterial",metadata.getRelatedMaterial());
        addText(1.0f,  doc,"relatedPublications",metadata.getRelatedPublications());
        addText(1.0f,  doc,"otherReferences",metadata.getOtherReferences());
         */
        addText(1.0f, doc, "subtitle", metadata.getSubTitle());
        List<StudyKeyword> studyKeywords = metadata.getStudyKeywords();
        for (Iterator it = studyKeywords.iterator(); it.hasNext();) {
            StudyKeyword elem = (StudyKeyword) it.next();
            addText(1.0f, doc, "keywordVocabulary", elem.getVocab());
            addText(1.0f, doc, "keywordVocabulary", elem.getVocabURI());
        }
        addText(1.0f, doc, "protocol", study.getProtocol());
        addText(1.0f, doc, "authority", study.getAuthority());
        addText(1.0f, doc, "globalId", study.getGlobalId());
        List<StudySoftware> studySoftware = metadata.getStudySoftware();
        for (Iterator it = studySoftware.iterator(); it.hasNext();) {
            StudySoftware elem = (StudySoftware) it.next();
            addText(1.0f, doc, "studySoftware", elem.getName());
            addText(1.0f, doc, "studySoftwareVersion", elem.getSoftwareVersion());
        }
        List<StudyGrant> studyGrants = metadata.getStudyGrants();
        for (Iterator it = studyGrants.iterator(); it.hasNext();) {
            StudyGrant elem = (StudyGrant) it.next();
            addText(1.0f, doc, "studyGrantNumber", elem.getNumber());
            addText(1.0f, doc, "studyGrantNumberAgency", elem.getAgency());
        }
        List<StudyGeoBounding> studyGeoBounding = metadata.getStudyGeoBoundings();
        for (Iterator it = studyGeoBounding.iterator(); it.hasNext();) {
            StudyGeoBounding elem = (StudyGeoBounding) it.next();
            addText(1.0f, doc, "studyEastLongitude", elem.getEastLongitude());
            addText(1.0f, doc, "studyWestLongitude", elem.getWestLongitude());
            addText(1.0f, doc, "studyNorthLatitude", elem.getNorthLatitude());
            addText(1.0f, doc, "studySouthLatitude", elem.getSouthLatitude());
        }

        // Extented metadata fields: 

        String templateName = metadata.getStudy().getTemplate().getName();

        for (StudyFieldValue extFieldValue : metadata.getStudyFieldValues()) {
            try {
                StudyField extStudyField = extFieldValue.getStudyField();
                String extFieldName = extStudyField.getName();
                String extFieldStrValue = extFieldValue.getStrValue();

                if (extFieldName != null && !extFieldName.equals("") && extFieldStrValue != null
                        && !extFieldStrValue.equals("")) {

                    addText(2.0f, doc, extFieldName, extFieldStrValue);

                    // Whenever we encounter an extended field actually 
                    // used in a study metadata, we want it to be searchable,
                    // on the "Advanced Search" page: (or do we?)

                    //extFieldValue.getTemplateField().getStudyField().setAdvancedSearchField(true);

                    // note that the above will only control the appearance of the 
                    // field on the Network-level Advanced Search page. (that 
                    // page uses the default list of advanced search fields, 
                    // which is simply all the lists from the StudyField DB
                    // table where isAdvancedField=true. Individual DVs 
                    // have their own lists of advanced fields. 
                    // As of now, we will make the field "advanced" only in
                    // its own dataverse: 
                    // (this is to be reviewed with Merce next week -- L.A. Feb. 22, 2012)

                    if (!metadata.getStudy().getOwner().getAdvSearchFields().contains(extStudyField)) {
                        metadata.getStudy().getOwner().getAdvSearchFields().add(extStudyField);
                    }

                }

            } catch (Exception ex) {
                // do nothing - if we can't retrieve the field, we are 
                // not going to index it, that's all.  
            }
        }

        for (FileMetadata fileMetadata : sv.getFileMetadatas()) {
            addText(1.0f, doc, "fileDescription", fileMetadata.getDescription());

        }

        addText(1.0f, doc, "unf", metadata.getUNF());
        //        writer = new IndexWriter(dir, true, getAnalyzer(), isIndexEmpty());
        logger.fine("Indexing study db id " + study.getId() + " (" + study.getStudyId() + ": "
                + metadata.getTitle() + ") from dataverse id " + study.getOwner().getId() + " ("
                + study.getOwner().getAlias() + ")");
        writer = new IndexWriter(dir, getAnalyzer(), isIndexEmpty(), IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setUseCompoundFile(true);
        TaxonomyWriter taxo = new DirectoryTaxonomyWriter(taxoDir);
        List<CategoryPath> categoryPaths = new ArrayList<CategoryPath>();
        addFacet(categoryPaths, "dvName", study.getOwner().getName());
        addFacetDate(categoryPaths, "productionDate", metadata.getProductionDate());
        addFacetDate(categoryPaths, "distributionDate", metadata.getDistributionDate());
        for (Iterator it = studyDistributors.iterator(); it.hasNext();) {
            StudyDistributor studyDistributor = (StudyDistributor) it.next();
            addFacet(categoryPaths, "distributorName", studyDistributor.getName());
        }
        for (Iterator it = studyAuthors.iterator(); it.hasNext();) {
            StudyAuthor elem = (StudyAuthor) it.next();
            addFacet(categoryPaths, "authorName", elem.getName());
            addFacet(categoryPaths, "authorAffiliation", elem.getAffiliation());
        }
        addFacet(categoryPaths, "country", metadata.getCountry());
        for (Iterator it = keywords.iterator(); it.hasNext();) {
            StudyKeyword elem = (StudyKeyword) it.next();
            addFacet(categoryPaths, "keywordValue", elem.getValue());
        }
        for (Iterator it = topicClassifications.iterator(); it.hasNext();) {
            StudyTopicClass elem = (StudyTopicClass) it.next();
            if (elem.getValue() != null && (!elem.getValue().equals("")) && elem.getVocab() != null
                    && (!elem.getVocab().equals(""))) {
                addFacet(categoryPaths, "topicClassValueParensVocab",
                        elem.getValue().trim() + " (" + elem.getVocab().trim() + ")");
            }
        }

        CategoryDocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo);
        categoryDocBuilder.setCategoryPaths(categoryPaths);
        categoryDocBuilder.build(doc);
        writer.addDocument(doc);
        // warnings from https://svn.apache.org/repos/asf/lucene/dev/tags/lucene_solr_3_5_0/lucene/contrib/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleIndexer.java
        // we commit changes to the taxonomy index prior to committing them to the search index.
        // this is important, so that all facets referred to by documents in the search index 
        // will indeed exist in the taxonomy index.
        taxo.commit();
        writer.commit();
        // close the taxonomy index and the index - all modifications are 
        // now safely in the provided directories: indexDir and taxoDir.
        taxo.close();
        writer.close();

        writerVar = new IndexWriter(dir, getAnalyzer(), isIndexEmpty(), IndexWriter.MaxFieldLength.UNLIMITED);

        StudyFile studyFile = null;
        DataTable dataTable = null;
        List<DataVariable> dataVariables = null;

        for (FileMetadata fileMetadata : sv.getFileMetadatas()) {
            //TODO: networkDataFile
            studyFile = fileMetadata.getStudyFile();
            if (studyFile instanceof TabularDataFile) {
                dataTable = ((TabularDataFile) studyFile).getDataTable();
                if (dataTable != null) {
                    dataVariables = dataTable.getDataVariables();
                    for (int j = 0; j < dataVariables.size(); j++) {
                        Document docVariables = new Document();
                        addText(1.0f, docVariables, "varStudyId", study.getId().toString());
                        addText(1.0f, docVariables, "varStudyFileId", studyFile.getId().toString());
                        DataVariable dataVariable = dataVariables.get(j);
                        addText(1.0f, docVariables, "varId", dataVariable.getId().toString());
                        addText(1.0f, docVariables, "varName", dataVariable.getName());
                        addText(1.0f, docVariables, "varLabel", dataVariable.getLabel());
                        writerVar.addDocument(docVariables);
                    }
                    dataVariables = null;
                    dataTable = null;
                }
            }
            studyFile = null;
        }

        writerVar.close();

        writerFileMeta = new IndexWriter(dir, getAnalyzer(), isIndexEmpty(),
                IndexWriter.MaxFieldLength.UNLIMITED);

        for (FileMetadata fileMetadata : sv.getFileMetadatas()) {
            studyFile = fileMetadata.getStudyFile();
            if (studyFile instanceof SpecialOtherFile) {
                Document docFileMetadata = new Document();
                // the "id" is the database id of the *study*; - for 
                // compatibility with the study-level index files. 
                addKeyword(docFileMetadata, "id", study.getId().toString());
                addText(1.0f, docFileMetadata, "studyFileId", studyFile.getId().toString());

                List<FileMetadataFieldValue> fileMetadataFieldValues = fileMetadata.getStudyFile()
                        .getFileMetadataFieldValues();
                for (int j = 0; j < fileMetadataFieldValues.size(); j++) {

                    String fieldValue = fileMetadataFieldValues.get(j).getStrValue();

                    FileMetadataField fmf = fileMetadataFieldValues.get(j).getFileMetadataField();
                    String fileMetadataFieldName = fmf.getName();
                    String fileMetadataFieldFormatName = fmf.getFileFormatName();
                    String indexFileName = fileMetadataFieldFormatName + "-" + fileMetadataFieldName;

                    addText(1.0f, docFileMetadata, indexFileName, fieldValue);

                }
                writerFileMeta.addDocument(docFileMetadata);
            }
            studyFile = null;
        }

        writerFileMeta.close();

        writerVersions = new IndexWriter(dir, new WhitespaceAnalyzer(), isIndexEmpty(),
                IndexWriter.MaxFieldLength.UNLIMITED);
        for (StudyVersion version : study.getStudyVersions()) {
            // The current(released) version UNF is indexed in the main document
            // only index previous(archived) version UNFs here
            if (version.isArchived()) {
                Document docVersions = new Document();
                addKeyword(docVersions, "versionStudyId", study.getId().toString());
                addText(1.0f, docVersions, "versionId", version.getId().toString());
                addText(1.0f, docVersions, "versionNumber", version.getVersionNumber().toString());
                addKeyword(docVersions, "versionUnf", version.getMetadata().getUNF());
                writerVersions.addDocument(docVersions);
            }
        }
        writerVersions.close();
        logger.fine("End indexing study " + study.getStudyId());
    }
}

From source file:org.wso2.carbon.analytics.dataservice.core.indexing.AnalyticsDataIndexer.java

License:Open Source License

/**
 * Re-indexes a batch of records into the local Lucene index shard identified by
 * {@code shardIndex}. Each record atomically replaces any previously indexed
 * document carrying the same internal id. The tenant/table identity is derived
 * from the first record, so all records in the batch are expected to belong to
 * the same table — TODO confirm callers guarantee this.
 *
 * @param shardIndex  index of the local shard whose writers are used
 * @param recordBatch batch of records to (re)index; an empty or null batch is a no-op
 * @param columns     column definitions used when generating the index documents
 * @throws AnalyticsIndexException if updating or committing the indices fails
 */
private void updateIndex(int shardIndex, List<Record> recordBatch, Map<String, ColumnDefinition> columns)
        throws AnalyticsIndexException {
    if (recordBatch == null || recordBatch.isEmpty()) {
        // Guard: the first-record access below would otherwise throw IndexOutOfBoundsException.
        return;
    }
    if (log.isDebugEnabled()) {
        log.debug("Updating data in local index [" + shardIndex + "]: " + recordBatch.size());
    }
    Record firstRecord = recordBatch.get(0);
    int tenantId = firstRecord.getTenantId();
    String tableName = firstRecord.getTableName();
    String tableId = this.generateTableId(tenantId, tableName);
    IndexWriter indexWriter = this.lookupIndexWriter(shardIndex, tableId);
    TaxonomyWriter taxonomyWriter = this.lookupTaxonomyIndexWriter(shardIndex, tableId);
    try {
        for (Record record : recordBatch) {
            // updateDocument() deletes any existing document matching the id term
            // and adds the freshly generated one in a single atomic operation.
            indexWriter.updateDocument(new Term(INDEX_ID_INTERNAL_FIELD, record.getId()),
                    this.generateIndexDoc(record, columns, taxonomyWriter).getFields());
        }
        // Commit the taxonomy BEFORE the search index: per the Lucene facet
        // 2-phase commit contract, every facet ordinal referenced by committed
        // search documents must already exist in the taxonomy index. Committing
        // the search index first could expose documents whose facets are missing
        // if the taxonomy commit subsequently fails.
        taxonomyWriter.commit();
        indexWriter.commit();
        if (this.isIndexingStatsEnabled()) {
            this.statsCollector.processedRecords(recordBatch.size());
        }
    } catch (IOException e) {
        // Preserve the cause so the underlying I/O failure remains diagnosable.
        throw new AnalyticsIndexException("Error in updating index: " + e.getMessage(), e);
    }
}