Java tutorial
/**
 * Copyright 2011, Campinas Stephane
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/**
 * @project trec-entity-tool
 * @author Campinas Stephane [ 3 Jun 2011 ]
 * @link stephane.campinas@deri.org
 */
package edu.udel.ece.infolab.btc;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import org.sindice.siren.analysis.TupleAnalyzer;
import org.sindice.siren.analysis.TupleAnalyzer.URINormalisation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
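/*
 * Illustration (not part of the original source): with a hypothetical dump
 * containing the following N-Quads,
 *
 *   <http://example.org/alice> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/0.1/Person> <http://example.org/g1> .
 *   <http://example.org/alice> <http://xmlns.com/foaf/0.1/name> "Alice" <http://example.org/g1> .
 *   <http://example.org/bob> <http://xmlns.com/foaf/0.1/knows> <http://example.org/alice> <http://example.org/g1> .
 *
 * the entity <http://example.org/alice> would end up as one Lucene document:
 *   subject         -> http://example.org/alice (angle brackets stripped, not analyzed)
 *   type            -> the rdf:type objects collected for the entity (here foaf:Person)
 *   outgoing-triple -> the n-tuples built from the first two statements
 *   incoming-triple -> the n-tuple built from the third statement, where alice is the object
 */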
/**
 * Indexes a list of entities, creating incoming-triple, outgoing-triple, subject
 * and type fields. The type field is a grouping of the rdf:type objects of this
 * entity.<br>
 * Outgoing triples are stored as n-tuples where a predicate carries all of its
 * related values.
 * Incoming triples are also stored as n-tuples, the difference being that a
 * predicate carries its related subject URIs.
 */
public abstract class Indexing {

  protected final Logger logger = LoggerFactory.getLogger(Indexing.class);

  /* Perform a commit by batch of COMMIT documents */
  public static int COMMIT = 10000;
  public static boolean STORE = true;
  public static int SKIP_TO = 0;
  public static int PROCESS_BATCH = 100000;

  // FIELDS
  final static public String INCOMING_TRIPLE = "incoming-triple";
  final static public String OUTGOING_TRIPLE = "outgoing-triple";
  final static public String SUBJECT = "subject";
  final static public String TYPE = "type";

  /* The data set files */
  protected final File[] input;
  protected int inputPos = 0;

  /* The current reader into the compressed archive */
  protected GZIPInputStream reader = null;
  protected BufferedReader _rin = null;

  /* The triple hashes for the current archive file */
  protected Hashtable<String, HashSet<String>> _outTripleHash = null;
  protected Hashtable<String, HashSet<String>> _inTripleHash = null;

  protected StringBuilder sb = new StringBuilder();

  /* The current entity */
  private final Entity _entity = new Entity();

  Pattern _clearPattern = Pattern.compile("[<>]");

  /* SIREn index */
  protected final Directory indexDir;
  protected final IndexWriter writer;

  /**
   * Create a SIREn index at indexDir, taking the files at inputDir as input.
   * @param inputDir
   * @param dir
   * @throws IOException
   */
  public Indexing(final File inputDir, final Directory dir)
  throws IOException {
    this.input = inputDir.listFiles(new FilenameFilter() {
      //@Override
      public boolean accept(File dir, String name) {
        return true;
      }
    });
    /*
     * Sort by filename: important because in the Sindice-ED dataset, two
     * consecutive dumps can store the same entity
     */
    Arrays.sort(this.input);
    if (this.input.length == 0) {
      throw new RuntimeException("No archive files in the folder: " + inputDir.getAbsolutePath());
    }
    this.indexDir = dir;
    this.writer = initializeIndexWriter(this.indexDir);
    reader = getInputStream(this.input[0]);
    _rin = new BufferedReader(new InputStreamReader(reader));
    logger.info("Creating index from input located at {} ({} files)",
      inputDir.getAbsolutePath(), input.length);
    logger.info("Reading dump: {}", this.input[0]);
    _inTripleHash = new Hashtable<String, HashSet<String>>();
    _outTripleHash = new Hashtable<String, HashSet<String>>();
  }

  /**
   * The regular expression of the input files
   * @return
   */
  protected abstract String getPattern();

  /**
   * Create a GZIP input stream over the file in
   * @param in
   * @return
   * @throws FileNotFoundException
   * @throws IOException
   */
  private GZIPInputStream getInputStream(final File in)
  throws FileNotFoundException, IOException {
    return new GZIPInputStream(new FileInputStream(in));
  }

  /**
   * Move to the next archive file.
   * @return false when all archive files have been processed, true otherwise
   */
  public boolean moveToNextArchive() {
    try {
      if (++inputPos >= input.length) {
        reader.close();
        return false;
      }
      // Next archive file
      reader.close();
      reader = getInputStream(input[inputPos]);
      _rin = new BufferedReader(new InputStreamReader(reader));
      logger.info("Reading dump: {}", this.input[inputPos]);
    } catch (IOException e) {
      logger.error("Error while reading the input: {}\n{}", input[inputPos], e);
    }
    _inTripleHash.clear();
    _outTripleHash.clear();
    return true;
  }
  /**
   * Create an index writer that uses a {@link TupleAnalyzer} on the triple fields,
   * tokenizing the URI's localname, and the default {@link WhitespaceAnalyzer}
   * on the other fields.
   * @param dir
   * @return
   * @throws IOException
   */
  @SuppressWarnings("deprecation")
  private IndexWriter initializeIndexWriter(final Directory dir)
  throws IOException {
    final Analyzer defaultAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_31);
    final Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
    final TupleAnalyzer tuple = new TupleAnalyzer(new StandardAnalyzer(Version.LUCENE_31));

    tuple.setURINormalisation(URINormalisation.LOCALNAME);
    fieldAnalyzers.put(OUTGOING_TRIPLE, tuple);
    fieldAnalyzers.put(INCOMING_TRIPLE, tuple);

    final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31,
      new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers));

    // Disable compound file
    ((LogMergePolicy) config.getMergePolicy()).setUseCompoundFile(false);
    // Increase merge factor to 20 - more adapted to batch creation
    ((LogMergePolicy) config.getMergePolicy()).setMergeFactor(20);

    config.setRAMBufferSizeMB(256);
    config.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    config.setMaxBufferedDeleteTerms(IndexWriterConfig.DISABLE_AUTO_FLUSH);

    final IndexWriter writer = new IndexWriter(dir, config);
    writer.setMaxFieldLength(Integer.MAX_VALUE);
    return writer;
  }

  private String HashSetToString(final HashSet<String> set) {
    sb.setLength(0);
    for (String s : set) {
      sb.append(s);
    }
    return sb.toString();
  }

  /**
   * Creates an entity index
   * @throws CorruptIndexException
   * @throws IOException
   */
  public void indexIt()
  throws CorruptIndexException, IOException {
    long counter = 0;

    do {
      while (true == parseOneBatch()) {
        counter = processOneBatch(counter);
      }
      counter = processOneBatch(counter);
    } while (moveToNextArchive());
    commit(false, counter, _entity.subject); // Commit what is left
    writer.optimize();
  }

  private long processOneBatch(long counter)
  throws CorruptIndexException, IOException {
    Enumeration<String> em = _outTripleHash.keys();

    while (em.hasMoreElements()) {
      _entity.clear();

      String subject = em.nextElement();
      String outTriples = HashSetToString(_outTripleHash.get(subject));
      Utils.sortAndFlattenNTriples(outTriples, _entity.outTuples, _entity.type, true);
      if (_inTripleHash.containsKey(subject)) {
        String inTriples = HashSetToString(_inTripleHash.get(subject));
        Utils.sortAndFlattenNTriples(inTriples, _entity.inTuples, _entity.type, false);
      }

      Document doc = new Document();
      doc.add(new Field(SUBJECT, trim(subject), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
      doc.add(new Field(TYPE, Utils.toString(_entity.type), Store.YES, Index.ANALYZED_NO_NORMS));
      doc.add(new Field(OUTGOING_TRIPLE, _entity.getTriples(true), Store.YES, Index.ANALYZED_NO_NORMS));
      doc.add(new Field(INCOMING_TRIPLE, _entity.getTriples(false), Store.YES, Index.ANALYZED_NO_NORMS));
      writer.addDocument(doc);
      counter = commit(true, counter, subject);
    }
    return counter;
  }

  /**
   * Commits the documents by batch
   * @param indexing
   * @param counter
   * @param subject
   * @return
   * @throws CorruptIndexException
   * @throws IOException
   */
  private long commit(boolean indexing, long counter, String subject)
  throws CorruptIndexException, IOException {
    if (!indexing || (++counter % COMMIT) == 0) { // Index by batch
      writer.commit();
      logger.info("Committed {} entities.", (indexing ? COMMIT : counter));
    }
    return counter;
  }
  /**
   * Close resources
   * @throws CorruptIndexException
   * @throws IOException
   */
  public void close()
  throws CorruptIndexException, IOException {
    try {
      writer.close();
    } finally {
      indexDir.close();
    }
  }

  private boolean isURI(String uri) {
    try {
      Matcher m = _clearPattern.matcher(uri);
      uri = m.replaceAll("");
      new URL(uri);
      return true;
    } catch (MalformedURLException e) {
      return false;
    }
  }

  /*
   * Strip the enclosing angle brackets from a URI string
   */
  private String trim(String str) {
    Matcher m = _clearPattern.matcher(str);
    str = m.replaceAll("");
    return str;
  }

  /*
   * Parse one batch of an archive file
   */
  protected boolean parseOneBatch() {
    StringBuilder tripleSb = new StringBuilder();

    _inTripleHash.clear();
    _outTripleHash.clear();
    int parsed = 0;
    try {
      String line;
      String sub;
      String obj;
      int pos = 1;

      while ((line = _rin.readLine()) != null) {
        String[] fields = line.split("\\s");

        if (fields.length < 3) {
          System.err.println("Invalid N-Triple record at line " + pos);
          continue;
        }
        sub = fields[0];
        if (false == isURI(sub)) {
          continue;
        }
        // Convert the N-Quad into an N-Triple by dropping the context
        tripleSb.setLength(0);
        for (int i = 0; i < fields.length - 2; ++i) {
          tripleSb.append(fields[i]).append(' ');
        }
        tripleSb.append(".\n");
        // Save the outgoing triple
        if (_outTripleHash.containsKey(sub)) {
          HashSet<String> triple = _outTripleHash.get(sub);
          triple.add(tripleSb.toString());
          _outTripleHash.put(sub, triple);
        } else {
          HashSet<String> triple = new HashSet<String>();
          triple.add(tripleSb.toString());
          _outTripleHash.put(sub, triple);
        }
        // Only if the object field is a URI can the statement also be stored as an incoming triple
        if (5 == fields.length) {
          obj = fields[2];
          if (true == isURI(obj)) {
            // Save the incoming triple
            if (_inTripleHash.containsKey(obj)) {
              HashSet<String> triple = _inTripleHash.get(obj);
              triple.add(tripleSb.toString());
              _inTripleHash.put(obj, triple);
            } else {
              HashSet<String> triple = new HashSet<String>();
              triple.add(tripleSb.toString());
              _inTripleHash.put(obj, triple);
            }
          }
        }
        ++pos;
        if (++parsed > PROCESS_BATCH) {
          break;
        }
      }
      if (parsed > PROCESS_BATCH) {
        return true;
      } else {
        _rin.close();
        return false;
      }
    } catch (IOException e) {
      System.err.println("I/O Error: " + e);
      e.printStackTrace();
      System.exit(-1);
    } catch (Exception e) {
      System.err.println("Error: " + e);
      e.printStackTrace();
      System.exit(-1);
    }
    return false;
  }

}
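Since Indexing is abstract, running it requires a small concrete subclass. The driver below is a minimal sketch and not part of the original code: the class names DumpIndexing and IndexingExample, the "dumps" and "entity-index" paths, the file-name pattern and the example subject URI are all placeholders, and it assumes the Lucene 3.1, SIREn, Entity and Utils classes referenced above are on the classpath. It builds the index from a folder of gzipped N-Quad dumps and then looks up one entity through the subject field, which is stored without angle brackets and is not analyzed.

package edu.udel.ece.infolab.btc;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IndexingExample {

  /** Minimal concrete subclass: accepts any gzipped dump file name (hypothetical). */
  static class DumpIndexing extends Indexing {
    public DumpIndexing(final File inputDir, final Directory dir) throws IOException {
      super(inputDir, dir);
    }
    @Override
    protected String getPattern() {
      return ".*\\.gz"; // placeholder pattern
    }
  }

  public static void main(String[] args) throws IOException {
    final File dumps = new File("dumps"); // folder of gzipped N-Quad dumps (placeholder)
    final Directory dir = FSDirectory.open(new File("entity-index")); // placeholder path

    // Build the index: parse the dumps batch by batch, committing every Indexing.COMMIT entities
    final DumpIndexing indexing = new DumpIndexing(dumps, dir);
    try {
      indexing.indexIt();
    } finally {
      indexing.close(); // closes both the IndexWriter and the Directory
    }

    // Look up one entity by its subject URI (stored without angle brackets)
    final Directory searchDir = FSDirectory.open(new File("entity-index"));
    final IndexSearcher searcher = new IndexSearcher(searchDir);
    try {
      final TermQuery query = new TermQuery(
        new Term(Indexing.SUBJECT, "http://example.org/alice")); // placeholder URI
      final TopDocs hits = searcher.search(query, 10);
      for (final ScoreDoc sd : hits.scoreDocs) {
        final Document doc = searcher.doc(sd.doc);
        System.out.println(doc.get(Indexing.TYPE)); // print the stored type field
      }
    } finally {
      searcher.close();
      searchDir.close();
    }
  }
}

Because Indexing.close() also closes the Directory it was given, the search part of the sketch opens its own Directory instance instead of reusing the one used for writing.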