Example usage for org.apache.lucene.index IndexWriter forceMerge

List of usage examples for org.apache.lucene.index IndexWriter forceMerge

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexWriter.forceMerge.

Prototype

public void forceMerge(int maxNumSegments) throws IOException 

Document

Forces merge policy to merge segments until there are <= maxNumSegments.
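
Before the full examples below, here is a minimal sketch of the typical call sequence. It is not taken from any of the listed source files and assumes a Lucene 4.4 setup with an in-memory RAMDirectory and a StandardAnalyzer: documents are added, the writer is committed, force-merged down to a single segment, and closed.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class ForceMergeSketch {
    public static void main(String[] args) throws IOException {
        // hypothetical minimal setup; the examples below use SimpleFSDirectory on disk instead
        Directory directory = new RAMDirectory();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
        IndexWriter writer = new IndexWriter(directory, config);

        Document document = new Document();
        document.add(new TextField("text", "sample content", Store.YES));
        writer.addDocument(document);

        writer.commit();
        // Ask the merge policy to merge segments until at most one segment remains.
        // On a large index this can be expensive in both I/O and CPU.
        writer.forceMerge(1);
        writer.close();

        analyzer.close();
        directory.close();
    }
}

Merging down to a single segment is the 4.x replacement for the old optimize() call; the examples below follow the same commit, forceMerge(1), close pattern.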

Usage

From source file:com.khepry.frackhem.entities.Blendeds.java

License:Apache License

public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName,
        Map<String, Toxicity> toxicities) throws IOException {

    String message;

    message = "Start Indexing Blendeds via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }

    File textFile = new File(textFilePath);
    if (textFile.exists()) {

        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        } else {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
        }

        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        } else {
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
        }

        if (indexFolder.exists() && taxonomyFolder.exists()) {

            List<String> colHeaders = new ArrayList<>();
            Map<String, Integer> colIndexes = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();

            String[] pieces;
            String[] tuples;

            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, indexField);
            }

            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }

            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);

            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);

            List<CategoryPath> taxonomyCategories = new ArrayList<>();

            String line;
            Integer rcdCount = 0;
            StringBuilder sb = new StringBuilder();
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    int i = 0;
                    for (String colHeader : pieces) {
                        colHeaders.add(colHeader.trim());
                        colIndexes.put(colHeader, i);
                        i++;
                    }
                } else {
                    if (pieces.length == colHeaders.size()) {
                        sb.setLength(0);
                        Document document = new Document();
                        for (int i = 0; i < pieces.length; i++) {
                            Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
                            document.add(field);
                            if (mapIndexFields.containsKey(colHeaders.get(i))) {
                                if (!pieces[i].trim().equals("")) {
                                    sb.append(pieces[i].trim());
                                    sb.append(" ");
                                }
                            }
                        }
                        // append toxicity information to the document
                        String toxCasEdfId = document.get(casEdfIdFieldName).trim();
                        Toxicity toxicity = new Toxicity();
                        if (toxicities.containsKey(toxCasEdfId)) {
                            toxicity = toxicities.get(toxCasEdfId);
                            document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxChemicalName().trim());
                            sb.append(" ");
                            document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxRecognized().trim());
                            sb.append(" ");
                            document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxSuspected().trim());
                            sb.append(" ");
                        } else {
                            document.add(new TextField("toxChemicalName", "", Store.YES));
                            document.add(new TextField("toxRecognized", "", Store.YES));
                            document.add(new TextField("toxSuspected", "", Store.YES));
                        }
                        Field field = new TextField("text", sb.toString().trim(), Store.NO);
                        document.add(field);

                        String toxChemical = toxicity.getToxChemicalName().trim();

                        // categorize recognized toxicities
                        String toxRecognized = toxicity.getToxRecognized().trim();
                        if (!toxRecognized.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxRecognized.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories
                                            .add(new CategoryPath("toxRecognized", "Toxicity", value));
                                }
                            }
                        }

                        // categorize suspected toxicities
                        String toxSuspected = toxicity.getToxSuspected().trim();
                        if (!toxSuspected.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxSuspected.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                                }
                            }
                        }

                        // build up "stats" taxonomy categories
                        for (String statsKey : mapStatsFields.keySet()) {
                            if (mapIndexFields.containsKey(statsKey)) {
                                String fieldValue = mapIndexFields.get(statsKey);
                                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("Blendeds", statsKey, fieldValue));
                                }
                            }
                        }

                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(document, taxonomyCategories);
                            // System.out.println("Taxonomies added: " +
                            // taxonomyCategories.size());
                        }

                        indexWriter.addDocument(document);
                        if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                            message = "Records indexed: " + rcdCount;
                            if (outputToSystemOut) {
                                System.out.println(message);
                            }
                            if (outputToMsgQueue) {
                                progressMessageQueue.send(new MessageInput(message));
                            }
                        }

                        taxonomyCategories.clear();
                    }
                }
            }
            br.close();
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }

            sb.setLength(0);
            sb.trimToSize();

            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();

            taxonomyWriter.commit();
            taxonomyWriter.close();

            analyzer.close();

            indexDirectory.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
        }
        message = "Ended Indexing Blendeds via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}

From source file:com.khepry.frackhem.entities.Chemicals.java

License:Apache License

public void indexViaLucene(String textFilePath, String textColSeparator, String casEdfIdFieldName,
        Map<String, Toxicity> toxicities) throws IOException {

    String message;

    message = "Start Indexing Chemicals via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }

    File textFile = new File(textFilePath);
    if (textFile.exists()) {

        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        } else {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
        }

        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        } else {
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
        }

        if (indexFolder.exists() && taxonomyFolder.exists()) {

            List<String> colHeaders = new ArrayList<>();
            Map<String, Integer> colIndexes = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();

            String[] pieces;
            String[] tuples;

            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, indexField);
            }

            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }

            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);

            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);

            List<CategoryPath> taxonomyCategories = new ArrayList<>();

            String line;
            Integer rcdCount = 0;
            StringBuilder sb = new StringBuilder();
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    int i = 0;
                    for (String colHeader : pieces) {
                        colHeaders.add(colHeader.trim());
                        colIndexes.put(colHeader, i);
                        i++;
                    }
                } else {
                    if (pieces.length == colHeaders.size()) {
                        sb.setLength(0);
                        Document document = new Document();
                        for (int i = 0; i < pieces.length; i++) {
                            Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
                            document.add(field);
                            if (mapIndexFields.containsKey(colHeaders.get(i))) {
                                if (!pieces[i].trim().equals("")) {
                                    sb.append(pieces[i].trim());
                                    sb.append(" ");
                                }
                            }
                        }
                        // append toxicity information to the document
                        String toxCasEdfId = document.get(casEdfIdFieldName).trim();
                        Toxicity toxicity = new Toxicity();
                        if (toxicities.containsKey(toxCasEdfId)) {
                            toxicity = toxicities.get(toxCasEdfId);
                            document.add(new TextField("toxChemicalName", toxicity.getToxChemicalName().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxChemicalName().trim());
                            sb.append(" ");
                            document.add(new TextField("toxRecognized", toxicity.getToxRecognized().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxRecognized().trim());
                            sb.append(" ");
                            document.add(new TextField("toxSuspected", toxicity.getToxSuspected().trim(),
                                    Store.YES));
                            sb.append(toxicity.getToxSuspected().trim());
                            sb.append(" ");
                        } else {
                            document.add(new TextField("toxChemicalName", "", Store.YES));
                            document.add(new TextField("toxRecognized", "", Store.YES));
                            document.add(new TextField("toxSuspected", "", Store.YES));
                        }
                        Field field = new TextField("text", sb.toString().trim(), Store.NO);
                        document.add(field);

                        String toxChemical = toxicity.getToxChemicalName().trim();

                        // categorize recognized toxicities
                        String toxRecognized = toxicity.getToxRecognized().trim();
                        if (!toxRecognized.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxRecognized.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories
                                            .add(new CategoryPath("toxRecognized", "Toxicity", value));
                                }
                            }
                        }

                        // categorize suspected toxicities
                        String toxSuspected = toxicity.getToxSuspected().trim();
                        if (!toxSuspected.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxSuspected.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                                }
                            }
                        }

                        // build up "stats" taxonomy categories
                        for (String statsKey : mapStatsFields.keySet()) {
                            if (mapIndexFields.containsKey(statsKey)) {
                                String fieldValue = mapIndexFields.get(statsKey);
                                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("Chemicals", statsKey, fieldValue));
                                }
                            }
                        }

                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(document, taxonomyCategories);
                            // System.out.println("Taxonomies added: " +
                            // taxonomyCategories.size());
                        }

                        indexWriter.addDocument(document);
                        if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                            message = "Records indexed: " + rcdCount;
                            if (outputToSystemOut) {
                                System.out.println(message);
                            }
                            if (outputToMsgQueue) {
                                progressMessageQueue.send(new MessageInput(message));
                            }
                        }

                        taxonomyCategories.clear();
                    }
                }
            }
            br.close();
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }

            sb.setLength(0);
            sb.trimToSize();

            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();

            taxonomyWriter.commit();
            taxonomyWriter.close();

            analyzer.close();

            indexDirectory.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
        }
        message = "Ended Indexing Chemicals via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}

From source file:com.khepry.frackhem.entities.Reports.java

License:Apache License

public void indexViaLucene(String textPath, String textColSeparator, Map<String, Toxicity> toxicities,
        String... parseFields) throws IOException {

    String message;

    message = "Start Indexing Reports via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }

    File textFile = new File(textPath);
    if (textFile.exists()) {

        File indexFolder = new File(indexFolderPath);

        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        }

        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        }

        if (indexFolder.exists() && taxonomyFolder.exists()) {

            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }

            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }

            Map<String, String> mapBreakFields = new LinkedHashMap<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapLevelFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();
            Map<String, Integer> mapColIndexes = new LinkedHashMap<>();

            String[] pieces;
            String[] tuples;

            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, "");
            }

            pieces = levelFields.split(",");
            for (String levelField : pieces) {
                mapBreakFields.put(levelField, "");
                mapLevelFields.put(levelField, "");
            }

            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }

            Map<String, Map<String, String>> mapToxValues = new LinkedHashMap<>();
            for (String parseField : parseFields) {
                mapToxValues.put(parseField, new TreeMap<String, String>());
            }

            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);

            List<CategoryPath> taxonomyCategories = new ArrayList<>();

            String line;

            StringBuilder sbIndex = new StringBuilder();
            StringBuilder sbLevel = new StringBuilder();

            Integer outCount = 0;
            Integer rcdCount = 0;

            Boolean firstDataRecordHandled = false;

            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    int i = 0;
                    for (String colHeader : pieces) {
                        mapColIndexes.put(colHeader.trim(), i);
                        i++;
                    }
                } else {
                    for (String key : mapLevelFields.keySet()) {
                        if (mapColIndexes.containsKey(key)) {
                            String value = pieces[mapColIndexes.get(key)].trim();
                            // build up level-break values
                            if (mapLevelFields.containsKey(key)) {
                                mapLevelFields.put(key, value);
                            }
                        }
                    }
                    if (!firstDataRecordHandled) {
                        mapBreakFields.putAll(mapLevelFields);
                        firstDataRecordHandled = true;
                    }
                    // if there is a "level break"
                    if (!mapLevelFields.equals(mapBreakFields)) {
                        Document tgtDocument = new Document();
                        for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
                            Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
                            tgtDocument.add(field);
                        }
                        for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
                            String fieldName = toxEntry.getKey();
                            String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
                            // System.out.println(fieldName + ": " + fieldValue);
                            sbIndex.append(fieldValue);
                            sbIndex.append(" ");
                            tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
                            // build up "Toxicity" taxonomy categories
                            for (String value : fieldValue.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
                                }
                            }
                            // build up "stats" taxonomy categories
                            for (String statsKey : mapStatsFields.keySet()) {
                                if (mapLevelFields.containsKey(statsKey)) {
                                    String levelValue = mapLevelFields.get(statsKey);
                                    if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
                                        taxonomyCategories
                                                .add(new CategoryPath("Reports", statsKey, levelValue));
                                    }
                                }
                            }
                        }
                        tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(tgtDocument, taxonomyCategories);
                            // System.out.println("Taxonomies added: " +
                            // taxonomyCategories.size());
                        }
                        indexWriter.addDocument(tgtDocument);
                        outCount++;
                        sbIndex.setLength(0);
                        for (String key : mapToxValues.keySet()) {
                            mapToxValues.get(key).clear();
                        }
                        taxonomyCategories.clear();
                        mapBreakFields.putAll(mapLevelFields);
                    }
                    // build up text index values
                    for (String key : mapLevelFields.keySet()) {
                        if (mapColIndexes.containsKey(key)) {
                            String value = pieces[mapColIndexes.get(key)].trim();
                            if (!value.equals("")) {
                                // build up 'text' field index value
                                if (mapIndexFields.containsKey(key)) {
                                    sbIndex.append(value);
                                    sbIndex.append(" ");
                                }
                            }
                        }
                    }
                    // build up toxicity values for later level-break use
                    if (mapColIndexes.containsKey(casEdfIdFieldName)) {
                        Toxicity toxicity = toxicities.get(pieces[mapColIndexes.get(casEdfIdFieldName)].trim());
                        if (toxicity != null) {
                            // build up recognized toxicity values
                            String[] toxRValues = toxicity.getToxRecognized().split(",");
                            for (String toxValue : toxRValues) {
                                if (!toxValue.equals("")) {
                                    if (!mapToxValues.get("toxRecognized").containsKey(toxValue)) {
                                        mapToxValues.get("toxRecognized").put(toxValue, toxValue);
                                    }
                                }
                            }
                            // build up suspected toxicity values
                            String[] toxSValues = toxicity.getToxSuspected().split(",");
                            for (String toxValue : toxSValues) {
                                if (!toxValue.equals("")) {
                                    if (!mapToxValues.get("toxSuspected").containsKey(toxValue)) {
                                        mapToxValues.get("toxSuspected").put(toxValue, toxValue);
                                    }
                                }
                            }
                        }
                    }
                    if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                        message = "Records indexed: " + rcdCount;
                        if (outputToSystemOut) {
                            System.out.println(message);
                        }
                        if (outputToMsgQueue) {
                            progressMessageQueue.send(new MessageInput(message));
                        }
                    }
                }
            }
            br.close();
            // handle end-of-file processing
            Document tgtDocument = new Document();
            for (Map.Entry<String, String> entry : mapBreakFields.entrySet()) {
                Field field = new TextField(entry.getKey(), entry.getValue(), Store.YES);
                tgtDocument.add(field);
            }
            for (Map.Entry<String, Map<String, String>> toxEntry : mapToxValues.entrySet()) {
                String fieldName = toxEntry.getKey();
                String fieldValue = GenericUtilities.joinString(toxEntry.getValue().values(), " ");
                // System.out.println(fieldName + ": " + fieldValue);
                sbIndex.append(fieldValue);
                sbIndex.append(" ");
                tgtDocument.add(new TextField(fieldName, fieldValue, Store.YES));
                // build up "Toxicity" taxonomy categories
                for (String value : fieldValue.replace(" ", ",").split(",")) {
                    if (!value.trim().equals("")) {
                        taxonomyCategories.add(new CategoryPath(fieldName, "Toxicity", value));
                    }
                }
                // build up "stats" taxonomy categories
                for (String statsKey : mapStatsFields.keySet()) {
                    if (mapLevelFields.containsKey(statsKey)) {
                        String levelValue = mapLevelFields.get(statsKey);
                        if (!statsKey.trim().equals("") && !levelValue.trim().equals("")) {
                            taxonomyCategories.add(new CategoryPath("Reports", statsKey, levelValue));
                        }
                    }
                }
            }
            tgtDocument.add(new TextField("text", sbIndex.toString().trim(), Store.NO));
            if (taxonomyCategories.size() > 0) {
                facetFields.addFields(tgtDocument, taxonomyCategories);
                // System.out.println("Taxonomies added: " +
                // taxonomyCategories.size());
            }
            indexWriter.addDocument(tgtDocument);
            outCount++;
            message = "Records processed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
            message = "Records indexed: " + outCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }

            sbIndex.setLength(0);
            sbIndex.trimToSize();

            sbLevel.setLength(0);
            sbLevel.trimToSize();

            mapToxValues.clear();

            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();

            analyzer.close();
            indexDirectory.close();

            taxonomyWriter.commit();
            taxonomyWriter.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }
        }
        message = "Ended Indexing Reports via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}

From source file:com.khepry.frackhem.entities.Toxicities.java

License:Apache License

public void indexViaLucene(String textFilePath, String textColSeparator) throws IOException {

    String message;

    message = "Start Indexing Toxicities via Lucene...";
    if (outputToSystemOut) {
        System.out.println(message);
    }
    if (outputToMsgQueue) {
        progressMessageQueue.send(new MessageInput(message));
    }

    File textFile = new File(textFilePath);
    if (textFile.exists()) {

        File indexFolder = new File(indexFolderPath);
        if (!indexFolder.exists()) {
            indexFolder.mkdir();
        } else {
            deleteFolder(indexFolder);
            if (!indexFolder.exists()) {
                indexFolder.mkdir();
            }
        }

        File taxonomyFolder = new File(taxonomyFolderPath);
        if (!taxonomyFolder.exists()) {
            taxonomyFolder.mkdir();
        } else {
            deleteFolder(taxonomyFolder);
            if (!taxonomyFolder.exists()) {
                taxonomyFolder.mkdir();
            }
        }

        if (indexFolder.exists() && taxonomyFolder.exists()) {

            List<String> colHeaders = new ArrayList<>();
            Map<String, String> mapIndexFields = new LinkedHashMap<>();
            Map<String, String> mapStatsFields = new LinkedHashMap<>();

            String[] pieces;
            String[] tuples;

            pieces = indexFields.split(",");
            for (String indexField : pieces) {
                mapIndexFields.put(indexField, indexField);
            }

            pieces = statsFields.split(",");
            for (String statField : pieces) {
                tuples = statField.split(":");
                mapStatsFields.put(tuples[0], tuples.length > 1 ? tuples[1] : tuples[0]);
            }

            SimpleFSDirectory indexDirectory = new SimpleFSDirectory(indexFolder);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
            IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);

            SimpleFSDirectory taxonomyDirectory = new SimpleFSDirectory(taxonomyFolder);
            TaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory, OpenMode.CREATE);
            FacetFields facetFields = new FacetFields(taxonomyWriter);

            List<CategoryPath> taxonomyCategories = new ArrayList<>();

            String line;
            Integer rcdCount = 0;
            StringBuilder sb = new StringBuilder();
            BufferedReader br = new BufferedReader(new FileReader(textFile));
            while ((line = br.readLine()) != null) {
                rcdCount++;
                pieces = line.split(textColSeparator);
                if (rcdCount == 1) {
                    for (String colHeader : pieces) {
                        colHeaders.add(colHeader.trim());
                    }
                } else {
                    if (pieces.length == colHeaders.size()) {
                        sb.setLength(0);
                        Document document = new Document();
                        for (int i = 0; i < pieces.length; i++) {
                            Field field = new TextField(colHeaders.get(i), pieces[i].trim(), Store.YES);
                            document.add(field);
                            if (mapIndexFields.containsKey(colHeaders.get(i))) {
                                if (!pieces[i].trim().equals("")) {
                                    sb.append(pieces[i].trim());
                                    sb.append(" ");
                                }
                            }
                        }
                        Field field = new TextField("text", sb.toString().trim(), Store.NO);
                        document.add(field);

                        String toxCasEdfId = pieces[0].trim();
                        String toxChemical = pieces[1].trim();

                        // categorize recognized toxicities
                        String toxRecognized = pieces[2].trim();
                        if (!toxRecognized.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxRecognized", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxRecognized.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories
                                            .add(new CategoryPath("toxRecognized", "Toxicity", value));
                                }
                            }
                        }

                        // categorize suspected toxicities
                        String toxSuspected = pieces[3].trim();
                        if (!toxSuspected.equals("")) {
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "CasEdfId", toxCasEdfId));
                            taxonomyCategories.add(new CategoryPath("toxSuspected", "Chemical",
                                    toxChemical.replace("/", "|")));
                            for (String value : toxSuspected.replace(" ", ",").split(",")) {
                                if (!value.trim().equals("")) {
                                    taxonomyCategories.add(new CategoryPath("toxSuspected", "Toxicity", value));
                                }
                            }
                        }

                        // build up "stats" taxonomy categories
                        for (String statsKey : mapStatsFields.keySet()) {
                            if (mapIndexFields.containsKey(statsKey)) {
                                String fieldValue = mapIndexFields.get(statsKey);
                                if (!statsKey.trim().equals("") && !fieldValue.trim().equals("")) {
                                    taxonomyCategories
                                            .add(new CategoryPath("Toxicities", statsKey, fieldValue));
                                }
                            }
                        }

                        if (taxonomyCategories.size() > 0) {
                            facetFields.addFields(document, taxonomyCategories);
                            // System.out.println("Taxonomies added: " +
                            // taxonomyCategories.size());
                        }

                        indexWriter.addDocument(document);
                        if (progressInterval > 0 && rcdCount % progressInterval == 0) {
                            message = "Records indexed: " + rcdCount;
                            if (outputToSystemOut) {
                                System.out.println(message);
                            }
                            if (outputToMsgQueue) {
                                progressMessageQueue.send(new MessageInput(message));
                            }
                        }

                        taxonomyCategories.clear();
                    }
                }
            }
            br.close();
            message = "Records indexed: " + rcdCount;
            if (outputToSystemOut) {
                System.out.println(message);
            }
            if (outputToMsgQueue) {
                progressMessageQueue.send(new MessageInput(message));
            }

            sb.setLength(0);
            sb.trimToSize();

            indexWriter.commit();
            indexWriter.forceMerge(1);
            indexWriter.close();

            taxonomyWriter.commit();
            taxonomyWriter.close();

            analyzer.close();

            indexDirectory.close();
            taxonomyDirectory.close();
        } else {
            message = "Lucene Index Folder: " + indexFolder + " or Lucene Taxonomy folder: " + taxonomyFolder
                    + " does not exist!";
            if (outputToSystemErr) {
                System.err.println(message);
            }
        }
        message = "Ended Indexing Toxicities via Lucene!";
        if (outputToSystemOut) {
            System.out.println(message);
        }
        if (outputToMsgQueue) {
            progressMessageQueue.send(new MessageInput(message));
        }
    }
}

From source file:com.mathworks.xzheng.indexing.IndexingTest.java

License:Apache License

public void testDeleteAfterOptimize() throws IOException {
    IndexWriter writer = getWriter();
    assertEquals(2, writer.numDocs());
    writer.deleteDocuments(new Term("id", "1"));
    //writer.optimize();                //3
    writer.forceMerge(1);
    writer.commit();
    assertFalse(writer.hasDeletions());
    assertEquals(1, writer.maxDoc()); //C
    assertEquals(1, writer.numDocs()); //C    
    writer.close();
}

From source file:com.mathworks.xzheng.indexing.VerboseIndexing.java

License:Apache License

private void index() throws IOException {

    Directory dir = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));
    // set the info stream before constructing the writer so verbose output is actually enabled
    config.setInfoStream(System.out);
    IndexWriter writer = new IndexWriter(dir, config);

    for (int i = 0; i < 100; i++) {
        Document doc = new Document();
        doc.add(new Field("keyword", "goober", Field.Store.YES, Field.Index.NOT_ANALYZED));
        writer.addDocument(doc);
    }
    //writer.optimize();
    writer.forceMerge(Integer.MAX_VALUE);
    writer.close();
}

From source file:com.mathworks.xzheng.searching.ScoreTest.java

License:Apache License

private void indexSingleFieldDocs(Field[] fields) throws Exception {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));

    IndexWriter writer = new IndexWriter(directory, config);
    for (Field f : fields) {
        Document doc = new Document();
        doc.add(f);
        writer.addDocument(doc);
    }
    writer.forceMerge(1);
    writer.close();
}

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat.java

License:Apache License

/** Make sure the final sub-block(s) are not skipped. */
public void testFinalBlock() throws Exception {
    Directory d = newDirectory();
    IndexWriter w = new IndexWriter(d, new IndexWriterConfig(new MockAnalyzer(random())));
    for (int i = 0; i < 25; i++) {
        Document doc = new Document();
        doc.add(newStringField("field", Character.toString((char) (97 + i)), Field.Store.NO));
        doc.add(newStringField("field", "z" + Character.toString((char) (97 + i)), Field.Store.NO));
        w.addDocument(doc);
    }
    w.forceMerge(1);

    DirectoryReader r = DirectoryReader.open(w);
    assertEquals(1, r.leaves().size());
    RocanaFieldReader field = (RocanaFieldReader) r.leaves().get(0).reader().fields().terms("field");
    // We should see exactly two blocks: one root block (prefix empty string) and one block for z* terms (prefix z):
    RocanaStats stats = field.getStats();
    assertEquals(0, stats.floorBlockCount);
    assertEquals(2, stats.nonFloorBlockCount);
    r.close();
    w.close();
    d.close();
}

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat2.java

License:Apache License

@Override
public void tearDown() throws Exception {
    iw.close();
    TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    iwc.setOpenMode(OpenMode.APPEND);
    IndexWriter iw = new IndexWriter(dir, iwc);
    iw.forceMerge(1);
    iw.close();
    dir.close(); // just force a checkindex for now
    super.tearDown();
}

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License:Apache License

public void test() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            if (fieldName.contains("payloadsFixed")) {
                TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
                return new TokenStreamComponents(tokenizer, filter);
            } else if (fieldName.contains("payloadsVariable")) {
                TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            } else {
                return new TokenStreamComponents(tokenizer);
            }
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    // TODO we could actually add more fields implemented with different PFs
    // or, just put this test into the usual rotation?
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsOnlyType.setStoreTermVectors(true);
    docsOnlyType.setIndexOptions(IndexOptions.DOCS);

    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsAndFreqsType.setStoreTermVectors(true);
    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn these on for a cross-check
    positionsType.setStoreTermVectors(true);
    positionsType.setStoreTermVectorPositions(true);
    positionsType.setStoreTermVectorOffsets(true);
    positionsType.setStoreTermVectorPayloads(true);
    FieldType offsetsType = new FieldType(positionsType);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field field1 = new Field("field1docs", "", docsOnlyType);
    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
    Field field3 = new Field("field3positions", "", positionsType);
    Field field4 = new Field("field4offsets", "", offsetsType);
    Field field5 = new Field("field5payloadsFixed", "", positionsType);
    Field field6 = new Field("field6payloadsVariable", "", positionsType);
    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
    doc.add(field1);
    doc.add(field2);
    doc.add(field3);
    doc.add(field4);
    doc.add(field5);
    doc.add(field6);
    doc.add(field7);
    doc.add(field8);
    for (int i = 0; i < MAXDOC; i++) {
        String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ')
                + " " + TestUtil.randomSimpleString(random());
        field1.setStringValue(stringValue);
        field2.setStringValue(stringValue);
        field3.setStringValue(stringValue);
        field4.setStringValue(stringValue);
        field5.setStringValue(stringValue);
        field6.setStringValue(stringValue);
        field7.setStringValue(stringValue);
        field8.setStringValue(stringValue);
        iw.addDocument(doc);
    }
    iw.close();
    verify(dir);
    TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge
    iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    iwc.setOpenMode(OpenMode.APPEND);
    IndexWriter iw2 = new IndexWriter(dir, iwc);
    iw2.forceMerge(1);
    iw2.close();
    verify(dir);
    dir.close();
}