Example usage for org.apache.lucene.index IndexWriter commit

Introduction

This page lists usage examples for the org.apache.lucene.index.IndexWriter.commit() method.

Prototype

@Override
public final long commit() throws IOException 

Document

Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.
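
Before the scraped examples below, a minimal, self-contained sketch of this lifecycle may help. It assumes a recent Lucene release (7.0 or later, where commit() returns the sequence number shown in the prototype above); the index path and field name are illustrative:

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class CommitExample {
    public static void main(String[] args) throws Exception {
        try (FSDirectory dir = FSDirectory.open(Paths.get("/tmp/commit-example"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("content", "hello commit", Field.Store.YES));
            writer.addDocument(doc);
            // Durably persists the pending addition; the returned value is the
            // sequence number of this commit operation (Lucene 7+).
            long seqNo = writer.commit();
            System.out.println("committed, sequence number = " + seqNo);
        }
    }
}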

Usage

From source file:fr.ericlab.sondy.algo.eventdetection.MABED.java

License:Open Source License

MABEDTopic getRefinedTopic(MABEDTopic simpleTopic, int nbrelatedTerms) {
    MABEDTopic refinedTopic = new MABEDTopic();
    String[] frequentTerms = new String[nbrelatedTerms];
    try {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        RAMDirectory temporaryIndex = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter temporaryWriter = new IndexWriter(temporaryIndex, config);
        Document doc = new Document();
        doc.add(new Field("content",
                dbAccess.getMessagesAsString(appVariables, simpleTopic.mainTerm, simpleTopic.I.timeSliceA,
                        simpleTopic.I.timeSliceB),
                Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
        temporaryWriter.addDocument(doc);
        temporaryWriter.commit();
        IndexReader temporaryReader = IndexReader.open(temporaryWriter, true);
        TermEnum allTerms = temporaryReader.terms();
        int minFreq = 0;
        TermInfoList termList = new TermInfoList();
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (!term.equals(simpleTopic.mainTerm) && term.length() > 1 && !appVariables.isStopWord(term)) {
                int cf = IndexAccess.getTermOccurenceCount(temporaryReader, term);
                if (cf > minFreq) {
                    termList.addTermInfo(new TermInfo(term, (int) cf));
                    termList.sortList();
                    if (termList.size() > nbrelatedTerms) {
                        termList.removeLast();
                    }
                    minFreq = termList.get(termList.size() - 1).occurence;
                }
            }
        }
        for (int i = 0; i < termList.size() && i < nbrelatedTerms; i++) {
            frequentTerms[i] = termList.get(i).text;
        }
        temporaryWriter.close();
        temporaryReader.close();
        temporaryIndex.close();

        float ref[] = indexAccess.getTermFrequency(appVariables, simpleTopic.mainTerm);
        float comp[];
        refinedTopic = new MABEDTopic(simpleTopic.mainTerm, simpleTopic.I, simpleTopic.score,
                simpleTopic.anomaly);
        for (int j = 0; j < nbrelatedTerms && frequentTerms[j] != null; j++) {
            comp = indexAccess.getTermFrequency(appVariables, frequentTerms[j]);
            double w = getErdemCoefficient(ref, comp, simpleTopic.I.timeSliceA, simpleTopic.I.timeSliceB);
            if (w >= _THETA_) {
                refinedTopic.relatedTerms.add(new MABEDWeightedTerm(frequentTerms[j], w));
            }
        }
    } catch (IOException ex) {
        Logger.getLogger(MABED.class.getName()).log(Level.SEVERE, null, ex);
    }
    return refinedTopic;
}
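
This example commits, then opens a reader from the writer so the freshly added document is visible for term statistics. The Lucene 3.6 calls it relies on (RAMDirectory, IndexReader.open(writer, true), TermEnum) have since been removed or replaced; a rough modern equivalent of the commit-then-read step, with illustrative names, is:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;

public class CommitThenReadSketch {
    public static void main(String[] args) throws Exception {
        // ByteBuffersDirectory is the in-memory successor of RAMDirectory.
        try (ByteBuffersDirectory dir = new ByteBuffersDirectory();
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("content", "terms to enumerate later", Field.Store.YES));
            writer.addDocument(doc);
            writer.commit();
            // DirectoryReader.open(writer) replaces IndexReader.open(writer, true):
            // it is a near-real-time reader that sees the writer's changes.
            try (DirectoryReader reader = DirectoryReader.open(writer)) {
                System.out.println("docs visible to reader: " + reader.numDocs());
            }
        }
    }
}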

From source file:fr.ericlab.sondy.core.DataManipulation.java

License:Open Source License

public void prepareStream(String datasetName, int intervalDuration, int ngram, String stemLanguage,
        boolean lemmatization, AppVariables appVariables) {
    try {
        Connection connection;
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        connection = DriverManager.getConnection("jdbc:mysql://" + appVariables.configuration.getHost(),
                appVariables.configuration.getUsername(), appVariables.configuration.getPassword());
        Statement statement = connection.createStatement();
        Statement statement2 = connection.createStatement();

        String lemStr = (lemmatization) ? "_lem1" : "_lem0";
        statement.executeUpdate("CREATE TABLE " + appVariables.configuration.getSchema() + "." + datasetName
                + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, msg_author VARCHAR(100), msg_post_time TIMESTAMP, msg_text VARCHAR(600), time_slice INT)ENGINE=myisam;");
        //            statement.executeUpdate("CREATE INDEX index_time ON "+appVariables.configuration.getSchema()+"."+datasetName+"_messages (msg_post_time)");

        ResultSet rsTMin = statement.executeQuery("select min(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMin.next();
        Timestamp tMin = rsTMin.getTimestamp(1);
        ResultSet rsTMax = statement.executeQuery("select max(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMax.next();
        Timestamp tMax = rsTMax.getTimestamp(1);
        Timestamp tRef = new Timestamp(0);
        long base = (tMin.getTime() - tRef.getTime()) * 1L;
        long streamDuration = (tMax.getTime() - tMin.getTime()) * 1L;
        long streamDurationMin = (streamDuration / 1000) / 60;

        String path = appVariables.configuration.getWorkspace() + "/datasets/" + datasetName + "/"
                + intervalDuration + "min-" + stemLanguage;
        path += (lemmatization) ? "-lem1" : "-lem0";
        path += "-" + ngram + "gram";
        String pathMention = path + "-m";

        FSDirectory indexGlobal = FSDirectory.open(new File(path));
        FSDirectory indexMention = FSDirectory.open(new File(pathMention));
        Analyzer analyzer;
        Properties props = new Properties();
        props.put("annotators", "tokenize,ssplit,parse,lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation;
        if (stemLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            Class cl;
            if (stemLanguage.equals("Chinese")) {
                analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
            } else {
                String packageName = stemLanguage.substring(0, 2).toLowerCase();
                cl = Class
                        .forName("org.apache.lucene.analysis." + packageName + "." + stemLanguage + "Analyzer");
                Class[] types = new Class[] { Version.class, Set.class };
                Constructor ct = cl.getConstructor(types);
                analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36, appVariables.currentStopWords.getSet());
            }
        }
        IndexWriterConfig configGlobal;
        IndexWriterConfig configMention;
        ShingleAnalyzerWrapper shingleAnalyzer = null;
        if (ngram > 1) {
            shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, ngram, ngram, " ", false, false);
            WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
        } else {
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        }
        IndexWriter wGlobal = new IndexWriter(indexGlobal, configGlobal);
        IndexWriter wMention = new IndexWriter(indexMention, configMention);

        int docId = 0;
        for (int i = 0; i < streamDurationMin; i += intervalDuration) {
            statement = connection.createStatement();
            long infBound = base + i * 60 * 1000L;
            long supBound = base + (i + intervalDuration) * 60 * 1000L;
            Timestamp infTime = new Timestamp(infBound);
            Timestamp supTime = new Timestamp(supBound);
            ResultSet rs = statement.executeQuery("SELECT msg_text, msg_post_time, msg_author FROM "
                    + appVariables.configuration.getSchema() + "." + datasetName
                    + "_messages WHERE msg_post_time>'" + infTime + "' AND msg_post_time< '" + supTime + "'");
            String globalContent = new String();
            String mentionContent = new String();
            String timestamps = new String();
            NumberFormat formatter = new DecimalFormat("00000000");
            int bulk = 0;
            String bulkString = "";
            boolean mention;
            while (rs.next()) {
                String message = rs.getString(1).toLowerCase();
                mention = message.contains("@");
                if (lemmatization) {
                    annotation = new Annotation(message);
                    message = "";
                    pipeline.annotate(annotation);
                    List<CoreMap> lem = annotation.get(SentencesAnnotation.class);
                    for (CoreMap l : lem) {
                        for (CoreLabel token : l.get(TokensAnnotation.class)) {
                            message += token.get(LemmaAnnotation.class) + " ";
                        }
                    }
                }
                if (ngram > 1) {
                    String processedMessage = "";
                    TokenStream tokenStream = shingleAnalyzer.tokenStream("text", new StringReader(message));
                    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                    while (tokenStream.incrementToken()) {
                        String termToken = charTermAttribute.toString();
                        if (!termToken.contains("_")) {
                            processedMessage += termToken.replace(" ", "=") + " ";
                        }
                    }
                    message = processedMessage;
                }
                bulk++;
                if (bulk < _BULK_SIZE_) {
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\"),";
                } else {
                    bulk = 0;
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\");";
                    statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                            + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_"
                            + ngram + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                            + bulkString);
                    bulkString = "";
                }
                globalContent += message + "\n";
                if (mention) {
                    mentionContent += message + "\n";
                }
                timestamps += rs.getString(2) + "\n";
            }
            if (bulk > 0 && bulkString.length() > 0) {
                statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                        + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                        + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                        + bulkString.substring(0, bulkString.length() - 1) + ";");
            }
            Document docGlobal = new Document();
            docGlobal.add(new Field("content", globalContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docGlobal.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wGlobal.addDocument(docGlobal);
            wGlobal.commit();
            Document docMention = new Document();
            docMention.add(new Field("content", mentionContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docMention.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wMention.addDocument(docMention);
            wMention.commit();

            File textFile = new File(path + "/input/" + formatter.format(docId) + ".text");
            FileUtils.writeStringToFile(textFile, globalContent);
            File timeFile = new File(path + "/input/" + formatter.format(docId) + ".time");
            FileUtils.writeStringToFile(timeFile, timestamps);

            docId++;
            statement.close();
        }
        statement2.executeUpdate("CREATE INDEX index_time_slice ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (time_slice);");
        statement2.executeUpdate("CREATE FULLTEXT INDEX index_text ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (msg_text);");
        statement2.close();
        connection.close();
        wGlobal.close();
        wMention.close();
    } catch (IOException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (NoSuchMethodException | SecurityException | IllegalArgumentException
            | InvocationTargetException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
}
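
Note that this method calls commit() on both writers once per time slice. Per the method description above, each commit syncs the referenced index files to stable storage, so very frequent commits can dominate indexing time. A common alternative is to commit every N additions; a minimal sketch of that pattern (the batch size is illustrative):

import java.io.IOException;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;

public final class BatchedCommit {
    // Illustrative batch size; tune for your durability/throughput trade-off.
    private static final int BATCH_SIZE = 1000;

    static void indexWithBatchedCommits(IndexWriter writer, List<Document> docs) throws IOException {
        int pending = 0;
        for (Document doc : docs) {
            writer.addDocument(doc);
            if (++pending >= BATCH_SIZE) {
                writer.commit(); // one durable sync per batch instead of per document
                pending = 0;
            }
        }
        if (pending > 0) {
            writer.commit(); // flush the final partial batch
        }
    }
}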

From source file:fr.ericlab.sondy.core.DataManipulation.java

License:Open Source License

public String[] getFrequentCoocurringTerms(String document, int numTerms, String baseTerm,
        AppVariables appVariables) {
    String[] frequentTerms = new String[numTerms];
    try {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        RAMDirectory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter w = new IndexWriter(index, config);
        Document doc = new Document();
        doc.add(new Field("content", document, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
        w.addDocument(doc);
        w.commit();
        IndexReader r = IndexReader.open(w, true);
        TermEnum allTerms = r.terms();
        int minFreq = 0;
        TermInfoList termList = new TermInfoList();
        StopWords stopWords = appVariables.currentStopWords;
        HashSet<String> stopWordsSet = stopWords.getSet();
        stopWords.add(baseTerm);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !stopWordsSet.contains(term)) {
                float cf = getTermOccurenceCount(r, term);
                if (cf > minFreq) {
                    termList.addTermInfo(new TermInfo(term, (int) cf));
                    termList.sortList();
                    if (termList.size() > numTerms) {
                        termList.removeLast();
                    }
                    minFreq = termList.get(termList.size() - 1).occurence;
                }
            }
        }
        for (int i = 0; i < termList.size(); i++) {
            frequentTerms[i] = termList.get(i).text;
        }
        w.close();
        r.close();
        index.close();
    } catch (Exception ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
    return frequentTerms;
}

From source file:fr.ericlab.sondy.core.DataManipulation.java

License:Open Source License

public String[] getFrequentCoocurringTermsFromFile(int numTerms, String baseTerm, AppVariables appVariables) {
    String[] frequentTerms = new String[numTerms];
    try {
        BufferedReader input = new BufferedReader(new FileReader("tmp.msg"));
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        RAMDirectory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter w = new IndexWriter(index, config);
        String line = "";
        String document = "";
        int count = 0;
        while ((line = input.readLine()) != null) {
            count++;
            document += line;
            if (count == 2000) {
                Document doc = new Document();
                doc.add(new Field("content", document, Field.Store.NO, Field.Index.ANALYZED,
                        Field.TermVector.YES));
                w.addDocument(doc);
                w.commit();
                count = 0;
                document = "";
            }
        }
        Document doc = new Document();
        doc.add(new Field("content", document, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
        w.addDocument(doc);
        w.commit();
        input.close();
        IndexReader r = IndexReader.open(w, true);
        TermEnum allTerms = r.terms();
        int minFreq = 0;
        TermInfoList termList = new TermInfoList();
        StopWords stopWords = appVariables.currentStopWords;
        HashSet<String> stopWordsSet = stopWords.getSet();
        stopWords.add(baseTerm);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !stopWordsSet.contains(term)) {
                float cf = getTermOccurenceCount(r, term);
                if (cf > minFreq) {
                    termList.addTermInfo(new TermInfo(term, (int) cf));
                    termList.sortList();
                    if (termList.size() > numTerms) {
                        termList.removeLast();
                    }
                    minFreq = termList.get(termList.size() - 1).occurence;
                }
            }
        }
        for (int i = 0; i < termList.size(); i++) {
            frequentTerms[i] = termList.get(i).text;
        }
        w.close();
        r.close();
        index.close();
    } catch (Exception ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
    return frequentTerms;
}

From source file:fr.univ_tours.etu.searcher.LikeThisTest.java

public void writerEntries() throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter indexWriter = new IndexWriter(indexDir, config);

    Document doc1 = createDocument("1", "doduck", "prototype your idea");
    Document doc2 = createDocument("2", "doduck", "love programming");
    Document doc3 = createDocument("3", "We do", "prototype");
    Document doc4 = createDocument("4", "We love", "challenge");
    indexWriter.addDocument(doc1);
    indexWriter.addDocument(doc2);
    indexWriter.addDocument(doc3);
    indexWriter.addDocument(doc4);

    indexWriter.commit();
    indexWriter.forceMerge(100, true);
    indexWriter.close();
}
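
The ordering here matters: commit() makes the four documents durable before forceMerge(100, true) runs, and with doWait=true the call only returns once merging finishes; close() then releases the writer. Merging down to a single segment is the more common request. A sketch of the same flow using try-with-resources (Directory and Analyzer stand in for the test's fields):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;

class MergeAfterCommitSketch {
    // Sketch only: forceMerge(1) is the typical "optimize to one segment"
    // call and blocks until the merge completes.
    static void rebuild(Directory indexDir, Analyzer analyzer) throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(analyzer)
                .setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        try (IndexWriter w = new IndexWriter(indexDir, config)) {
            // ... addDocument(...) calls as in the test ...
            w.commit();      // durable point before merging
            w.forceMerge(1);
        }
    }
}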

From source file:framework.retrieval.engine.context.RFacade.java

License:Apache License

public void closeIndexWriter(IndexWriter indexWriter) {
    if (indexWriter != null) {
        try {
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            indexWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
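
The helper commits and closes in separate try blocks so a failed commit still lets close() run. Since IndexWriter is Closeable, try-with-resources gives the same guarantee more compactly; a sketch:

import java.io.IOException;

import org.apache.lucene.index.IndexWriter;

class CloseHelperSketch {
    // Sketch: commit then close, with close() guaranteed on every exit path,
    // including when commit() throws.
    static void commitAndClose(IndexWriter indexWriter) throws IOException {
        if (indexWriter == null) {
            return;
        }
        try (IndexWriter w = indexWriter) {
            w.commit();
        }
    }
}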

From source file:framework.retrieval.engine.index.create.impl.RIndexWriter.java

License:Apache License

/**
 * Adds a batch of documents to the index, staging them in a RAM index first.
 * @param documents
 */
public void addDocument(List<Document> documents) {
    if (indexPathType == null) {
        throw new RetrievalDocumentException("indexPathType must not be null");
    }

    if (documents == null || documents.size() <= 0) {
        return;
    }
    int length = documents.size();
    RetrievalUtil.debugLog(log, "Adding " + length + " documents to index " + indexPathType);

    //      RetrievalIndexLock.getInstance().lock(indexPathType);
    IndexWriter indexWriter = null;
    RIndexWriterWrap ramIndexWriterWrap = indexWriteProvider.createRamIndexWriter();
    try {
        for (int i = 0; i < length; i++) {
            Document document = documents.get(i);
            ramIndexWriterWrap.getIndexWriter().addDocument(document);
        }

        ramIndexWriterWrap.getIndexWriter().commit();
        indexWriter = getIndexWriter(indexPathType);
        indexWriter.addIndexesNoOptimize(new Directory[] { ramIndexWriterWrap.getDirectory() });

    } catch (Exception e) {
        throw new RetrievalDocumentException(e);
    } finally {
        if (ramIndexWriterWrap != null) {
            try {
                ramIndexWriterWrap.close();
            } catch (Exception e) {
                RetrievalUtil.errorLog(log, e);
            }
        }
        if (indexWriter != null) {
            try {
                indexWriter.commit();
            } catch (Exception e) {
                RetrievalUtil.errorLog(log, e);
            }
            try {
                indexWriter.close();
            } catch (Exception e) {
                RetrievalUtil.errorLog(log, e);
            }
        }
        //         RetrievalIndexLock.getInstance().unlock(indexPathType);
    }
}
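
This method stages documents in a RAM index, folds it into the main index with addIndexesNoOptimize, and commits in the finally block. addIndexesNoOptimize belongs to the 3.x API; later releases renamed it to addIndexes(Directory...), and since IndexWriter already buffers added documents in memory, the staging directory is usually unnecessary today. A sketch of the renamed call, with illustrative names:

import java.io.IOException;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;

class AddIndexesSketch {
    // Sketch: merging a staged directory into the main index, where
    // addIndexesNoOptimize was renamed to addIndexes (varargs Directory...).
    static void mergeStaged(IndexWriter mainWriter, Directory stagedDir) throws IOException {
        mainWriter.addIndexes(stagedDir);
        mainWriter.commit(); // make the merged segments durable
    }
}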

From source file:framework.retrieval.engine.index.create.impl.RIndexWriter.java

License:Apache License

/**
 * Adds documents directly to the index, without staging them in a RAM index.
 * @param documents
 */
public void addDocumentNowRamSupport(List<Document> documents) {
    if (indexPathType == null) {
        throw new RetrievalDocumentException("indexPathType must not be null");
    }

    if (documents == null || documents.size() <= 0) {
        return;
    }
    int length = documents.size();
    RetrievalUtil.debugLog(log, "Adding " + length + " documents to index " + indexPathType);

    //      RetrievalIndexLock.getInstance().lock(indexPathType);
    IndexWriter indexWriter = null;
    indexWriter = getIndexWriter(indexPathType);
    try {
        for (int i = 0; i < length; i++) {
            Document document = documents.get(i);
            indexWriter.addDocument(document);
        }

    } catch (Exception e) {
        throw new RetrievalDocumentException(e);
    } finally {
        if (indexWriter != null) {
            try {
                indexWriter.commit();
            } catch (Exception e) {
                RetrievalUtil.errorLog(log, e);
            }
            try {
                indexWriter.close();
            } catch (Exception e) {
                RetrievalUtil.errorLog(log, e);
            }
        }
        //         RetrievalIndexLock.getInstance().unlock(indexPathType);
    }
}

From source file:framework.retrieval.engine.index.create.impl.RIndexWriter.java

License:Apache License

/**
 * Adds a single document to the index.
 * @param document
 */
public void addDocument(Document document) {
    if (indexPathType == null) {
        throw new RetrievalDocumentException("indexPathType must not be null");
    }

    //      RetrievalIndexLock.getInstance().lock(indexPathType);
    IndexWriter indexWriter = null;
    indexWriter = getIndexWriter(indexPathType);
    try {
        RetrievalUtil.debugLog(log, "Adding a document to index " + indexPathType);

        indexWriter.addDocument(document);
    } catch (Exception e) {
        throw new RetrievalDocumentException(e);
    } finally {
        if (indexWriter != null) {
            try {
                indexWriter.commit();
            } catch (Exception e) {
                RetrievalUtil.errorLog(log, e);
            }
            try {
                indexWriter.close();
            } catch (Exception e) {
                RetrievalUtil.errorLog(log, e);
            }
        }
        //         RetrievalIndexLock.getInstance().unlock(indexPathType);
    }
}

From source file:framework.retrieval.engine.index.create.impl.RIndexWriter.java

License:Apache License

/**
 * Deletes from the index all documents that match the given term.
 * @param indexPathType
 * @param term
 */
public void deleteDocument(String indexPathType, Term term) {

    //      RetrievalIndexLock.getInstance().lock(indexPathType);

    IndexWriter indexWriter = null;
    try {
        RetrievalUtil.debugLog(log, "Deleting documents from index " + indexPathType);
        try {
            indexWriter = getIndexWriter(indexPathType);
            indexWriter.deleteDocuments(term);
        } catch (Exception e) {
            throw new RetrievalDocumentException(e);
        }
    } finally {
        if (indexWriter != null) {
            try {
                indexWriter.commit();
            } catch (Exception e) {
                e.printStackTrace();
            }
            try {
                indexWriter.close();
            } catch (Exception e) {
                RetrievalUtil.errorLog(log, e);
            }
        }
        //         RetrievalIndexLock.getInstance().unlock(indexPathType);
    }
}
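
deleteDocuments(term) only marks matching documents as deleted; the commit() in the finally block is what makes the deletions durable and visible to newly opened readers. A compact sketch of the same delete-and-commit step, with an illustrative field name:

import java.io.IOException;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

class DeleteAndCommitSketch {
    // Sketch: remove all documents whose "id" field equals the given value,
    // then commit so the deletion survives a crash and reaches new readers.
    static void deleteById(IndexWriter writer, String id) throws IOException {
        writer.deleteDocuments(new Term("id", id));
        writer.commit();
    }
}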