Java tutorial
/*
 * Leech - crawling capabilities for Apache Tika
 *
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.parser.incremental;



import java.io.IOException;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Bits;

import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.config.CrawlerContext;



/**
 * A persistent history database that records everything that was processed during a crawl. This history makes incremental crawling possible, where you can quickly
 * check whether a data entity found during the crawl is new or modified with respect to the last crawl. Further, all data entities that were removed since the last
 * crawl can be determined for final synchronization.<br>
 * To check whether a file is new or modified, IncrementalCrawlingHistory needs two pieces of information: a 'data entity exists ID', which is an identifier for a data
 * entity that is independent from the content of this entity. It is only used to identify the existence of the entity, not to check whether it has changed. A 'data
 * entity content fingerprint' gives the hint whether the content of the data entity has changed. This can be e.g. the modified date of a file, or a mail header
 * hash.<br>
 * To determine the data entities that were removed since the last crawl, IncrementalCrawlingHistory records the crawl starting time and updates a 'last
 * crawled/checked time' entry for every data entity. When the crawl is finished, every data entity whose 'last crawled/checked time' is before the recorded crawl
 * starting time is considered as outdated and thus as removed.<br>
 * This is an easy, intuitive, general approach that should work for almost all possible data entities. Other approaches store e.g. parent/child relationships of data
 * entities, maintain the resulting relationship lists, and infer whether an entity was deleted or not.
 * These approaches have the advantage that, in some cases, you can determine immediately while crawling a container data source whether a data entity was deleted,
 * before the recursive call. However, while this is easy for e.g. file system data sources, it is much more complicated in other scenarios such as web crawls, where a
 * link can potentially be part of several 'container' websites.<br>
 * The timestamp approach we chose is much simpler and works under the same conditions in all scenarios, but has two disadvantages compared to the other approaches:<br>
 * <li>You have to update every data entity history entry on every crawl with the new 'last crawled/checked time', even if the entity has not changed at all.<br>
 * <li>The information which data entities were removed can be determined only at the end of a crawl, for the whole history.<br>
 * <br>
 * We realized this crawling history with an underlying Lucene index.
 *
 * <br>
 * To enable incremental indexing during a crawl, pass a {@link CrawlerContext} instance with a path to the history into the ParseContext parameter of the
 * Leech.parse(..) method:<br>
 * <code>
 * Leech leech = new Leech();<br>
 * Metadata metadata = new Metadata();<br>
 * {@link CrawlerContext} crawlerContext = new {@link CrawlerContext}().setIncrementalCrawlingHistoryPath("./history/forResourceDir");<br>
 * leech.parse(new File("resource"), new PrintlnContentHandler(metadata), crawlerContext.createParseContext());<br>
 * </code> <br>
 * Make sure that you always use the matching history for a specific crawling source - this is a 1:1 relationship, you cannot mix them. Otherwise, everything will be
 * considered as new, and all formerly crawled entities as deleted.
 *
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class IncrementalCrawlingHistory
{

    protected class CrawlFinishedIterator implements Iterator<String>
    {

        protected LinkedList<String> m_llQueuedOutdatedIDs = new LinkedList<String>();

        protected Query m_query = null;



        protected CrawlFinishedIterator() throws IOException
        {
            if (m_lCrawlStartingTime == null)
                throw new IllegalStateException("No crawl starting time found. Did you invoke crawlStarted?");

            m_query = LongPoint.newRangeQuery(lastCrawledTime, 0L, m_lCrawlStartingTime - 1);
        }



        @Override
        public boolean hasNext()
        {
            try
            {
                // here we ask the index whether there are still outdated entities left - unless we still have some queued from the last query. If we return
                // false, we close all the Lucene stuff.
                if (m_query == null) return false;

                // if the queue is empty, we issue a new search query
                if (m_llQueuedOutdatedIDs.size() == 0)
                {
                    refreshIndexReaderz();

                    TopDocs topDocs = m_indexSearcher.search(m_query, 5000);
                    Bits liveDocs = MultiFields.getLiveDocs(m_indexReader);

                    for (ScoreDoc scoreDoc : topDocs.scoreDocs)
                    {
                        // skip deleted documents
                        if (liveDocs != null && !liveDocs.get(scoreDoc.doc)) continue;

                        Document doc4Queue = m_indexReader.document(scoreDoc.doc, Collections.singleton(dataEntityId));

                        m_llQueuedOutdatedIDs.add(doc4Queue.get(dataEntityId));
                    }

                    // if the queue is still empty, there is nothing left
                    if (m_llQueuedOutdatedIDs.size() == 0)
                    {
                        // close everything - we are done
                        closeLuceneStuff();

                        return false;
                    }
                }

                return true;
            }
            catch (IOException e)
            {
                Logger.getLogger(IncrementalCrawlingHistory.CrawlFinishedIterator.class.getName()).log(Level.SEVERE, "Error", e);

                return false;
            }
        }



        @Override
        public String next()
        {
            // here we return the id - we remove it from the queue and delete its entry from the index beforehand
            try
            {
                if (m_llQueuedOutdatedIDs.isEmpty()) return null;

                m_indexWriter.deleteDocuments(new Term(dataEntityId, m_llQueuedOutdatedIDs.getFirst()));

                return m_llQueuedOutdatedIDs.poll();
            }
            catch (Exception e)
            {
                Logger.getLogger(IncrementalCrawlingHistory.CrawlFinishedIterator.class.getName()).log(Level.SEVERE, "Error", e);
            }

            return null;
        }



        @Override
        public void remove()
        {
            throw new UnsupportedOperationException();
        }
    }
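

    /**
     * Illustrative sketch (added for this tutorial, not part of the original class): how a caller typically consumes the iterator returned by
     * {@link #crawlFinished()}. Every id returned here has already been deleted from the history index, so the caller only has to remove the entity from its own
     * target index or database - the println below is a hypothetical placeholder for that removal logic.
     */
    static void exampleHandleRemovedEntities(IncrementalCrawlingHistory history)
    {
        Iterator<String> itOutdatedDataEntityIds = history.crawlFinished();

        while (itOutdatedDataEntityIds != null && itOutdatedDataEntityIds.hasNext())
        {
            String strRemovedDataEntityId = itOutdatedDataEntityIds.next();

            // hypothetical placeholder - e.g. delete the entity from your target search index
            System.out.println("removed since last crawl: " + strRemovedDataEntityId);
        }
    }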


    /**
     * Defines the states whether a data entity is in the history or not. There are three states: Exist.NOT says that the data entity has no entry inside the history
     * at all. Exist.YES_UNPROCESSED means that the entity has an entry inside the history, and that it has not yet been processed during the current crawl.
     * Exist.YES_PROCESSED means that there is an entry and the data entity was already processed in this run, so normally another processing is unnecessary. This is
     * to detect cycles.
     *
     * @author Christian Reuschling, Dipl.Ing.(BA)
     */
    public enum Exist {
        NOT, YES_PROCESSED, YES_UNPROCESSED
    }



    static public final String dataEntityContentFingerprint = "dataEntityContentFingerprint";

    static public final String dataEntityId = "dataEntityId";

    static public final String masterDataEntityId = "masterDataEntityId";

    static public final String lastCrawledTime = "lastCrawledTime";

    protected DirectoryReader m_indexReader = null;

    protected IndexSearcher m_indexSearcher = null;

    protected IndexWriter m_indexWriter = null;

    protected Long m_lCrawlStartingTime = null;

    protected final String m_strHistoryPath;



    public IncrementalCrawlingHistory(String strHistoryPath)
    {
        m_strHistoryPath = strHistoryPath;

        Runtime.getRuntime().addShutdownHook(new Thread("IncrementalCrawlingHistory shutdown hook for " + strHistoryPath)
        {
            @Override
            public void run()
            {
                try
                {
                    closeLuceneStuff();
                }
                catch (IOException e)
                {
                    Logger.getLogger(IncrementalCrawlingHistory.class.getName()).log(Level.SEVERE, "Error", e);
                }
            }
        });
    }



    /**
     * Records a new data entity, together with the current time as 'last crawled/checked time'.
     *
     * @param strDataEntityId an identifier for a data entity that is independent from the content of this entity. It is only used to identify the occurrence, not
     *            to check whether it has changed (e.g. a file name)
     * @param strDataEntityContentFingerprint some fingerprint/identifier that gives the hint whether the content of the data entity has changed, e.g. the modified
     *            date of a file
     *
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void addDataEntity(String strDataEntityId, String strDataEntityContentFingerprint) throws CorruptIndexException, IOException
    {
        addDataEntity(strDataEntityId, strDataEntityContentFingerprint, null);
    }



    /**
     * Records a new data entity, together with the current time as 'last crawled/checked time'.
     *
     * @param strDataEntityId an identifier for a data entity that is independent from the content of this entity. It is only used to identify the occurrence, not
     *            to check whether it has changed (e.g. a file name)
     * @param strDataEntityContentFingerprint some fingerprint/identifier that gives the hint whether the content of the data entity has changed, e.g. the modified
     *            date of a file
     * @param strMasterDataEntityId optional: the id of another data entity that acts as our 'master', which means that when the master is updated with
     *            {@link #updateDataEntityLastCrawledTime(String)}, all associated slaves will be updated as well. This is e.g. for the case when you are in a second
     *            run of RSS file indexing, and Leech recognizes that this file didn't change. Then we don't want to enter the file unnecessarily and mark each entry
     *            on its own. We know that no subentry has changed, and can immediately mark all of them as processed with
     *            {@link #updateDataEntityLastCrawledTime(String)} on the master dataEntityId, which is the one of the RSS file. Leave it null or empty in the case
     *            you don't need it.
     *
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void addDataEntity(String strDataEntityId, String strDataEntityContentFingerprint, String strMasterDataEntityId) throws CorruptIndexException, IOException
    {
        Document doc = new Document();

        doc.add(new StringField(dataEntityId, strDataEntityId, Store.YES));
        doc.add(new StringField(dataEntityContentFingerprint, strDataEntityContentFingerprint, Store.YES));
        doc.add(new LongPoint(lastCrawledTime, System.currentTimeMillis()));
        doc.add(new StoredField(lastCrawledTime, System.currentTimeMillis()));

        if (!StringUtils.nullOrWhitespace(strMasterDataEntityId)) doc.add(new StringField(masterDataEntityId, strMasterDataEntityId, Store.YES));

        m_indexWriter.addDocument(doc);
    }
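

    /**
     * Illustrative sketch (added for this tutorial, not part of the original class): how the optional master/slave mechanism can be used. The ids and fingerprints
     * are made-up values. All entries of an RSS file are added as slaves of the file's own entry; when the file turns out to be unchanged in a later crawl, a single
     * {@link #updateDataEntityLastCrawledTime(String)} on the master id marks the file and all of its entries as processed.
     */
    static void exampleMasterSlaveUsage(IncrementalCrawlingHistory history) throws IOException
    {
        // first crawl: the RSS file itself plus its entries, each entry pointing to the file entry as its master
        history.addDataEntity("feed.rss", "modified-1325372400000");
        history.addDataEntity("feed.rss#entry1", "entryHash1", "feed.rss");
        history.addDataEntity("feed.rss#entry2", "entryHash2", "feed.rss");

        // a later crawl finds the file unchanged: marking the master also refreshes the 'last crawled/checked time' of all slaves
        history.updateDataEntityLastCrawledTime("feed.rss");
    }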


    public void closeLuceneStuff() throws IOException
    {
        if (m_indexSearcher != null)
        {
            m_indexSearcher = null;
        }

        if (m_indexReader != null)
        {
            m_indexReader.close();
            m_indexReader = null;
        }

        if (m_indexWriter != null)
        {
            m_indexWriter.commit();
            m_indexWriter.close();
            m_indexWriter = null;
        }
    }



    /**
     * Returns all DataEntityIds with a 'last crawled/checked time' before the 'crawl starting time' as outdated data entities. These are all entities that don't
     * exist in this crawl anymore, and thus can be considered as removed.<br>
     * You can only invoke and walk the iterator once - while iterating, the outdated entries inside the history will be deleted. In the case you invoke this method
     * twice, the second invocation will result in an empty list. This ensures that even huge lists of deleted entities can be handled without problematic memory
     * consumption.<br>
     * Remark: the writer and reader instances for the underlying Lucene index will be closed when you walk the iterator to the end; all data will be committed
     * before.
     *
     * @return all DataEntityIds with a 'last crawled/checked time' before the 'crawl starting time', thus all entities that can be considered as removed.
     */
    public Iterator<String> crawlFinished()
    {
        try
        {
            return new CrawlFinishedIterator();
        }
        catch (IOException e)
        {
            Logger.getLogger(IncrementalCrawlingHistory.class.getName()).log(Level.SEVERE, "Error", e);

            return null;
        }
    }



    /**
     * Informs the history that a new crawl has started. The history will save the current time as 'crawl starting time'.<br>
     * Remark: the writer and reader instances for the underlying Lucene index will be opened if necessary
     *
     * @throws IOException
     * @throws LockObtainFailedException
     * @throws CorruptIndexException
     */
    public void crawlStarted() throws CorruptIndexException, LockObtainFailedException, IOException
    {
        openLuceneStuff();

        // we remember the current crawl starting time - it is needed in crawlFinished to determine the outdated entities
        m_lCrawlStartingTime = System.currentTimeMillis();
    }
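

    /**
     * Illustrative sketch (added for this tutorial, not part of the original class): the overall lifecycle when driving the history directly instead of through
     * Leech. {@link #crawlStarted()} opens the Lucene index and records the crawl starting time, which both {@link #exists(String)} and the {@link #crawlFinished()}
     * iterator rely on. The per-entity decision logic is sketched in exampleProcessDataEntity further below.
     */
    static void exampleCrawlLifecycle(IncrementalCrawlingHistory history) throws IOException
    {
        history.crawlStarted();

        // ... crawl the data source and invoke exampleProcessDataEntity(history, id, fingerprint) for every data entity found ...

        // everything whose 'last crawled/checked time' was not refreshed during this crawl is gone from the source - see exampleHandleRemovedEntities above
        exampleHandleRemovedEntities(history);
    }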


    /**
     * Checks whether an ID exists inside the incremental crawling history or not. During the crawl, this is to identify quickly whether a data entity is completely
     * new or not.
     *
     * @param strDataEntityId an identifier for a data entity that is independent from the content of this entity. It is only used to identify the occurrence, not
     *            to check whether it has changed (e.g. a file name)
     *
     * @return There are three states: Exist.NOT says that the data entity has no entry inside the history at all. Exist.YES_UNPROCESSED means that the entity has
     *         an entry inside the history, and that it has not yet been processed during the current crawl. Exist.YES_PROCESSED means that there is an entry and the
     *         data entity was already processed in this run, so normally another processing is unnecessary. This is to detect cycles.
     *
     * @throws IOException
     */
    public Exist exists(String strDataEntityId) throws IOException
    {
        Long lDataEntityLastCrawledTime = getDataEntityLastCrawledTime(strDataEntityId);

        if (lDataEntityLastCrawledTime == null) return Exist.NOT;

        if (lDataEntityLastCrawledTime >= m_lCrawlStartingTime) return Exist.YES_PROCESSED;

        return Exist.YES_UNPROCESSED;
    }



    /**
     * Checks whether an ID with a specific content fingerprint exists in the crawling history or not. During the crawl, this is to identify quickly whether a data
     * entity has changed its content or not. Of course, this only makes sense in the case the content fingerprint that gives the hint whether the entity has changed
     * can be created quickly, at best without extracting the content. Such a fingerprint can be e.g. the modified date of a file, or the time attribute of an email.
     *
     * @param strDataEntityId an identifier for a data entity that is independent from the content of this entity. It is only used to identify the occurrence, not
     *            to check whether it has changed (e.g. a file name)
     * @param strDataEntityContentFingerprint some fingerprint/identifier that gives the hint whether the content of the data entity has changed, e.g. the modified
     *            date of a file
     *
     * @return true in the case this identifier exists with exactly this content fingerprint inside the crawling history
     *
     * @throws IOException
     */
    public boolean existsWithContent(String strDataEntityId, String strDataEntityContentFingerprint) throws IOException
    {
        if (StringUtils.nullOrWhitespace(strDataEntityId)) return false;

        BooleanQuery query = (new BooleanQuery.Builder())
                .add(new TermQuery(new Term(dataEntityId, strDataEntityId)), Occur.MUST)
                .add(new TermQuery(new Term(dataEntityContentFingerprint, strDataEntityContentFingerprint)), Occur.MUST)
                .build();

        TotalHitCountCollector collector = new TotalHitCountCollector();

        refreshIndexReaderz();
        m_indexSearcher.search(query, collector);

        if (collector.getTotalHits() > 0) return true;

        return false;
    }
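

    /**
     * Illustrative sketch (added for this tutorial, not part of the original class): the typical per-entity decision during an incremental crawl, based on
     * {@link #exists(String)} and {@link #existsWithContent(String, String)}. The 'process it' comments are hypothetical placeholders for your own parsing/indexing
     * logic; id and fingerprint are whatever the data source provides (e.g. a file name and its modified date).
     */
    static void exampleProcessDataEntity(IncrementalCrawlingHistory history, String strDataEntityId, String strContentFingerprint) throws IOException
    {
        Exist existState = history.exists(strDataEntityId);

        if (existState == Exist.YES_PROCESSED)
        {
            // already handled during this run - probably a cycle, nothing to do
            return;
        }

        if (existState == Exist.NOT)
        {
            // completely new entity: process it, then record it in the history
            history.addDataEntity(strDataEntityId, strContentFingerprint);
        }
        else if (history.existsWithContent(strDataEntityId, strContentFingerprint))
        {
            // known and unchanged: only refresh the 'last crawled/checked time' so crawlFinished() won't report it as removed
            history.updateDataEntityLastCrawledTime(strDataEntityId);
        }
        else
        {
            // known but modified: process it again and replace the old history entry
            history.updateDataEntity(strDataEntityId, strContentFingerprint);
        }
    }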


    /**
     * Gets the stored content fingerprint for a given data entity entry.
     *
     * @param strDataEntityId an identifier for a data entity that is independent from the content of this entity. It is only used to identify the occurrence, not
     *            to check whether it has changed (e.g. a file name)
     *
     * @return the according content fingerprint stored for this data entity, null in the case this data entity was not found
     *
     * @throws IOException
     */
    public String getDataEntityContentFingerprint(String strDataEntityId) throws IOException
    {
        if (StringUtils.nullOrWhitespace(strDataEntityId)) return null;

        Term termId = new Term(dataEntityId, strDataEntityId);

        refreshIndexReaderz();
        TopDocs topDocs = m_indexSearcher.search(new TermQuery(termId), 1);

        if (topDocs.totalHits == 0) return null;

        Document doc = m_indexReader.document(topDocs.scoreDocs[0].doc, Collections.singleton(dataEntityContentFingerprint));

        return doc.get(dataEntityContentFingerprint);
    }



    /**
     * Gets the stored last crawled time for a given data entity entry. This can be used to e.g. determine whether a data entity was already processed during the
     * current crawl or not. If it was processed already, this is a hint for a cycle.
     *
     * @param strDataEntityId an identifier for a data entity that is independent from the content of this entity. It is only used to identify the occurrence, not
     *            to check whether it has changed (e.g. a file name)
     *
     * @return the according last crawled time stored for this data entity, null in the case this data entity was not found
     *
     * @throws IOException
     */
    public Long getDataEntityLastCrawledTime(String strDataEntityId) throws IOException
    {
        if (StringUtils.nullOrWhitespace(strDataEntityId)) return null;

        Term termId = new Term(dataEntityId, strDataEntityId);

        refreshIndexReaderz();
        TopDocs topDocs = m_indexSearcher.search(new TermQuery(termId), 1);

        if (topDocs.totalHits == 0) return null;

        Document doc = m_indexReader.document(topDocs.scoreDocs[0].doc, Collections.singleton(lastCrawledTime));

        return Long.valueOf(doc.get(lastCrawledTime));
    }


    /**
     * Gets the path to this history
     *
     * @return the path to this history
     */
    public String getHistoryPath()
    {
        return m_strHistoryPath;
    }



    /**
     * Creates all writer, reader, and searcher objects if necessary
     *
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    public void openLuceneStuff() throws CorruptIndexException, LockObtainFailedException, IOException
    {
        if (m_indexWriter == null)
        {
            IndexWriterConfig config = new IndexWriterConfig(new KeywordAnalyzer());
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);

            m_indexWriter = new IndexWriter(new SimpleFSDirectory(Paths.get(m_strHistoryPath)), config);
        }

        if (m_indexReader == null) m_indexReader = DirectoryReader.open(m_indexWriter, true, true);

        if (m_indexSearcher == null) m_indexSearcher = new IndexSearcher(m_indexReader);
    }



    protected void refreshIndexReaderz()
    {
        try
        {
            DirectoryReader newReader = DirectoryReader.openIfChanged(m_indexReader);

            if (newReader != null)
            {
                m_indexReader.close();
                m_indexReader = newReader;
                m_indexSearcher = new IndexSearcher(m_indexReader);
            }
        }
        catch (IOException e)
        {
            Logger.getLogger(IncrementalCrawlingHistory.class.getName()).log(Level.SEVERE, "Error", e);
        }
    }


    /**
     * Updates a whole data entity - same as addDataEntity, but removes a former entry before storing the new one
     *
     * @param strDataEntityId an identifier for a data entity that is independent from the content of this entity. It is only used to identify the occurrence, not
     *            to check whether it has changed (e.g. a file name)
     * @param strDataEntityContentFingerprint some fingerprint/identifier that gives the hint whether the content of the data entity has changed, e.g. the modified
     *            date of a file
     *
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void updateDataEntity(String strDataEntityId, String strDataEntityContentFingerprint) throws CorruptIndexException, IOException
    {
        updateDataEntity(strDataEntityId, strDataEntityContentFingerprint, null);
    }



    /**
     * Updates a whole data entity - same as addDataEntity, but removes a former entry before storing the new one
     *
     * @param strDataEntityId an identifier for a data entity that is independent from the content of this entity. It is only used to identify the occurrence, not
     *            to check whether it has changed (e.g. a file name)
     * @param strDataEntityContentFingerprint some fingerprint/identifier that gives the hint whether the content of the data entity has changed, e.g. the modified
     *            date of a file
     * @param strMasterDataEntityId optional: the id of another data entity that acts as our 'master', which means that when the master is updated with
     *            {@link #updateDataEntityLastCrawledTime(String)}, all associated slaves will be updated as well. This is e.g. for the case when you are in a second
     *            run of RSS file indexing, and Leech recognizes that this file didn't change. Then we don't want to enter the file unnecessarily and mark each entry
     *            on its own. We know that no subentry has changed, and can immediately mark all of them as processed with
     *            {@link #updateDataEntityLastCrawledTime(String)} on the master dataEntityId, which is the one of the RSS file. Leave it null or empty in the case
     *            you don't need it.
     *
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void updateDataEntity(String strDataEntityId, String strDataEntityContentFingerprint, String strMasterDataEntityId) throws CorruptIndexException, IOException
    {
        Term termId = new Term(dataEntityId, strDataEntityId);

        Document doc = new Document();

        doc.add(new StringField(dataEntityId, strDataEntityId, Store.YES));
        doc.add(new StringField(dataEntityContentFingerprint, strDataEntityContentFingerprint, Store.YES));
        doc.add(new LongPoint(lastCrawledTime, System.currentTimeMillis()));
        doc.add(new StoredField(lastCrawledTime, System.currentTimeMillis()));

        if (!StringUtils.nullOrWhitespace(strMasterDataEntityId)) doc.add(new StringField(masterDataEntityId, strMasterDataEntityId, Store.YES));

        m_indexWriter.updateDocument(termId, doc);
    }



    /**
     * Sets a data entity's 'last crawled/checked time' entry to the current time. In the case this data entity is a master entity, all slave documents will be
     * updated as well. You can set an entity as a master entity with {@link #addDataEntity(String, String, String)} or
     * {@link #updateDataEntity(String, String, String)}
     *
     * @param strDataEntityId the data entity which is finally checked/crawled
     *
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void updateDataEntityLastCrawledTime(String strDataEntityId) throws CorruptIndexException, IOException
    {
        Term termId = new Term(dataEntityId, strDataEntityId);

        refreshIndexReaderz();
        TopDocs topDocs = m_indexSearcher.search(new TermQuery(termId), 1);

        if (topDocs.totalHits == 0)
            throw new IllegalStateException("there has to be a data entity entry with id " + strDataEntityId + " for updating. Nothing was found.");

        long lCurrentTime = System.currentTimeMillis();

        Document doc = m_indexReader.document(topDocs.scoreDocs[0].doc);

        doc.removeFields(lastCrawledTime);
        doc.add(new LongPoint(lastCrawledTime, lCurrentTime));
        doc.add(new StoredField(lastCrawledTime, lCurrentTime));

        m_indexWriter.updateDocument(termId, doc);

        // if this entity is a master data entity, all associated slave entities have to be updated as well
        termId = new Term(masterDataEntityId, strDataEntityId);

        topDocs = m_indexSearcher.search(new TermQuery(termId), Integer.MAX_VALUE);

        for (int i = 0; i < topDocs.scoreDocs.length; i++)
        {
            Document slaveDoc = m_indexReader.document(topDocs.scoreDocs[i].doc);

            slaveDoc.removeFields(lastCrawledTime);
            slaveDoc.add(new LongPoint(lastCrawledTime, lCurrentTime));
            slaveDoc.add(new StoredField(lastCrawledTime, lCurrentTime));

            // update each slave under its own dataEntityId term, with its own refreshed document
            m_indexWriter.updateDocument(new Term(dataEntityId, slaveDoc.get(dataEntityId)), slaveDoc);
        }
    }

}
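
To round off the tutorial, here is the usage example from the class comment as a small, self-contained program. It assumes the Leech project and Apache Tika are on the classpath; the import locations of Leech and PrintlnContentHandler, the history path and the "resource" directory are illustrative assumptions that may need adjusting for your setup. Running it a second time against an unchanged directory reports no new or modified data entities, because the crawl is checked against the history created in the first run.

import java.io.File;

import org.apache.tika.metadata.Metadata;

import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.sax.PrintlnContentHandler;

public class IncrementalCrawlingExample
{
    public static void main(String[] args) throws Exception
    {
        Leech leech = new Leech();
        Metadata metadata = new Metadata();

        // reuse the same history directory on every crawl of this source - the history and the source form a 1:1 relationship
        CrawlerContext crawlerContext = new CrawlerContext().setIncrementalCrawlingHistoryPath("./history/forResourceDir");

        // only new and modified data entities are reported; entities removed since the last crawl can be synchronized afterwards via the history
        leech.parse(new File("resource"), new PrintlnContentHandler(metadata), crawlerContext.createParseContext());
    }
}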