Java tutorial
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package start.lucene;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashSet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.xmlpull.v1.XmlPullParserException;

/**
 * This class is used to index the given database entries (cPeptideEntries).
 *
 * Each entry is a tab-separated line; every column is kept as a stored-only
 * field except the mass, which is indexed as a numeric (trie-encoded) double
 * so that it can be searched by range.
 *
 * @author Sule
 */
public class CPeptidesIndexer {

    /** Minimum number of tab-separated columns an entry must have (columns 0..10, mass being the last). */
    private static final int REQUIRED_COLUMNS = 11;
    /** Column that holds the mass. */
    private static final int MASS_COLUMN = 10;
    /** Optional column that holds the type. */
    private static final int TYPE_COLUMN = 11;
    /** Optional column that holds the labeling flag. */
    private static final int LABEL_COLUMN = 12;

    private IndexWriter indexWriter;
    private final File indexDirPath; // a directory which contains all index files
    private boolean isConstructed = false; // whether an IndexWriter has already been constructed
    private final HashSet<StringBuilder> cPeptideEntries;
    private final FieldType indexedFieldType = new FieldType();
    private int totalDoc = 0; // number of documents added by index()

    /**
     * Constructs a CPeptidesIndexer from a given set of entries and the
     * folder in which the index files are to be written.
     *
     * @param cPeptideEntries tab-separated information for each cross-linked
     * peptide entry
     * @param folder a location where index files are stored
     */
    public CPeptidesIndexer(HashSet<StringBuilder> cPeptideEntries, File folder) {
        indexDirPath = folder;
        this.cPeptideEntries = cPeptideEntries;
        // Precision step of the numeric trie used for the mass field; the
        // original author picked 6 as a value suited to 64 bit data types
        // (long, double).
        indexedFieldType.setNumericPrecisionStep(6);
        indexedFieldType.setStored(true);
        indexedFieldType.setIndexed(true);
        indexedFieldType.setNumericType(FieldType.NumericType.DOUBLE);
    }

    /**
     * Returns the IndexWriter, constructing it on first use.
     *
     * The writer is opened on {@link #getIndexDirPath()} in CREATE mode, so
     * any index already present in that folder is rebuilt from scratch. Once
     * constructed, the same instance is returned on every call, also after
     * {@link #index()} has closed it.
     *
     * @return the IndexWriter of this indexer
     * @throws IOException if the directory or the writer cannot be opened
     */
    public IndexWriter getIndexWriter() throws IOException {
        if (!isConstructed || indexWriter == null) {
            // FSDirectory.open lets Lucene choose the best FSDirectory
            // implementation for the current environment.
            // (An in-memory index would use: Directory index = new RAMDirectory();)
            Directory dir = FSDirectory.open(indexDirPath);
            // various types of analyzers exist but only StandardAnalyzer is used
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_4, new StandardAnalyzer());
            // each index file is reconstructed from scratch
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            indexWriter = new IndexWriter(dir, config);
            // do not pack newly written segments into a compound file
            indexWriter.getConfig().setUseCompoundFile(false);
            isConstructed = true;
        }
        return indexWriter;
    }

    /**
     * Creates a Document for each entry of cPeptideEntries and adds it to the
     * IndexWriter. The writer is closed afterwards, also when adding a
     * document fails, so that the index directory is not left locked.
     * Closing commits the documents that were added up to that point.
     *
     * @throws FileNotFoundException
     * @throws IOException if a document cannot be written or the writer
     * cannot be opened or closed
     * @throws XmlPullParserException
     * @throws IllegalArgumentException if an entry is malformed, see
     * {@link #getDocument(StringBuilder)}
     */
    public void index() throws FileNotFoundException, IOException, XmlPullParserException {
        IndexWriter writer = getIndexWriter();
        try {
            for (StringBuilder cPeptideEntry : cPeptideEntries) {
                writer.addDocument(getDocument(cPeptideEntry));
                totalDoc++;
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Prepares a Document from a given tab-separated line for indexing.
     *
     * Columns 0-9 are stored (not indexed, hence not searchable) under
     * PROTEINA, PROTEINB, PEPTIDEA, PEPTIDEB, LINKA, LINKB, FIXMODA, FIXMODB,
     * VARMODA and VARMODB. Column 10 is the mass, indexed and stored as a
     * double. Columns 11 (TYPE) and 12 (LABEL) are optional and each one is
     * stored only when present. The document ID is the writer's current
     * maxDoc value.
     *
     * @param line a tab-separated entry with at least 11 columns
     * @return the Document built from the line
     * @throws IOException if the IndexWriter cannot be opened
     * @throws IllegalArgumentException if the line has fewer than 11 columns
     * or (as NumberFormatException) the mass column is not a number
     */
    public Document getDocument(StringBuilder line) throws IOException {
        String[] sp = line.toString().split("\t");
        if (sp.length < REQUIRED_COLUMNS) {
            throw new IllegalArgumentException("Expected at least " + REQUIRED_COLUMNS
                    + " tab-separated columns but found " + sp.length + ": " + line);
        }
        Document doc = new Document();
        int id = getIndexWriter().maxDoc();
        // Fill document like "name-value" pair.
        doc.add(new IntField(FieldName.ID, id, Field.Store.YES));
        // Everything except the mass is a StoredField: stored with the
        // document but not indexed at all (and so, not searchable).
        doc.add(new StoredField(FieldName.PROTEINA, sp[0])); // proteinA name
        doc.add(new StoredField(FieldName.PROTEINB, sp[1])); // proteinB name
        doc.add(new StoredField(FieldName.PEPTIDEA, sp[2])); // peptideA sequence
        doc.add(new StoredField(FieldName.PEPTIDEB, sp[3])); // peptideB sequence
        doc.add(new StoredField(FieldName.LINKA, sp[4])); // link on peptideA
        doc.add(new StoredField(FieldName.LINKB, sp[5])); // link on peptideB
        doc.add(new StoredField(FieldName.FIXMODA, sp[6])); // fixed modifications peptideA
        doc.add(new StoredField(FieldName.FIXMODB, sp[7])); // fixed modifications peptideB
        doc.add(new StoredField(FieldName.VARMODA, sp[8])); // variable modifications peptideA
        doc.add(new StoredField(FieldName.VARMODB, sp[9])); // variable modifications peptideB
        // The mass is the only searchable value: indexed as a numeric double.
        doc.add(new DoubleField(FieldName.MASS, Double.parseDouble(sp[MASS_COLUMN]), indexedFieldType));
        // The two trailing columns are optional; check each one on its own so
        // that an entry with a type but no label does not read past the array.
        if (sp.length > TYPE_COLUMN) {
            doc.add(new StoredField(FieldName.TYPE, sp[TYPE_COLUMN])); // Type
        }
        if (sp.length > LABEL_COLUMN) {
            doc.add(new StoredField(FieldName.LABEL, sp[LABEL_COLUMN])); // Labeling-true:Heavylabeled
        }
        return doc;
    }

    /**
     * Returns the total number of documents added to the index. If no
     * IndexWriter has been constructed yet, the entries are indexed first.
     *
     * @return the number of documents added by {@link #index()}
     * @throws IOException
     * @throws FileNotFoundException
     * @throws XmlPullParserException
     */
    public int getTotalDoc() throws IOException, FileNotFoundException, XmlPullParserException {
        if (!isConstructed || indexWriter == null) {
            // index() constructs the writer itself before adding documents
            index();
        }
        return totalDoc;
    }

    public File getIndexDirPath() {
        return indexDirPath;
    }

    public boolean isIsConstructed() {
        return isConstructed;
    }

    public HashSet<StringBuilder> getcPeptideEntries() {
        return cPeptideEntries;
    }

    public FieldType getIndexedFieldType() {
        return indexedFieldType;
    }
}