Java tutorial
/* * Copyright 2010 Bizosys Technologies Limited * * Licensed to the Bizosys Technologies Limited (Bizosys) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The Bizosys licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.bizosys.unstructured; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; import org.apache.lucene.index.CorruptIndexException; import com.bizosys.hsearch.byteutils.SortedBytesInteger; import com.bizosys.hsearch.hbase.HWriter; import com.bizosys.hsearch.hbase.NV; import com.bizosys.hsearch.hbase.RecordScalar; import com.bizosys.hsearch.idsearch.util.IdSearchLog; import com.bizosys.hsearch.treetable.client.partition.IPartition; import com.bizosys.hsearch.treetable.storage.HBaseTableSchemaDefn; import com.bizosys.hsearch.treetable.unstructured.IIndexFrequencyTable; import com.bizosys.hsearch.treetable.unstructured.IIndexMetadataFlagTable; import com.bizosys.hsearch.treetable.unstructured.IIndexMetadataFrequencyTable; import com.bizosys.hsearch.treetable.unstructured.IIndexOffsetTable; import com.bizosys.hsearch.treetable.unstructured.IIndexPositionsTable; import com.bizosys.hsearch.util.Hashing; public class IndexWriter { private static String unknownDocumentType = "-"; private static final int FREQUENCY_TABLE = 0; private static final int OFFSET_TABLE = 1; private static final int POSITION_TABLE = 2; private static final int DOCMETA_FREQUENCY_TABLE = 4; private static final int DOCMETA_FLAG_TABLE = 5; private int tableType = -1; static boolean INFO_ENABLED = IdSearchLog.l.isInfoEnabled(); private List<IndexRow> cachedIndex = new ArrayList<IndexWriter.IndexRow>(); private IIndexFrequencyTable tableFrequency = null; private IIndexOffsetTable tableOffset = null; private IIndexPositionsTable tablePositions = null; private IIndexMetadataFrequencyTable tableDocMetaWithFrequency = null; private IIndexMetadataFlagTable tableDocMetaWithFlag = null; SearchConfiguration sConf = null; private IndexWriter() throws InstantiationException { sConf = SearchConfiguration.getInstance(); } public IndexWriter(IIndexFrequencyTable tableFrequency) throws InstantiationException { this(); this.tableFrequency = tableFrequency; tableType = FREQUENCY_TABLE; } public IndexWriter(IIndexOffsetTable tableOffset) throws InstantiationException { this(); this.tableOffset = tableOffset; tableType = OFFSET_TABLE; } public IndexWriter(IIndexPositionsTable tablePosition) throws InstantiationException { this(); this.tablePositions = tablePosition; tableType = POSITION_TABLE; } public IndexWriter(IIndexMetadataFrequencyTable tableDocMetaWithFrequency) throws InstantiationException { this(); this.tableDocMetaWithFrequency = tableDocMetaWithFrequency; tableType = DOCMETA_FREQUENCY_TABLE; } public IndexWriter(IIndexMetadataFlagTable tableDocMetaWithFlag) throws InstantiationException { this(); this.tableDocMetaWithFlag = tableDocMetaWithFlag; tableType = DOCMETA_FLAG_TABLE; } public byte[] toBytes() throws IOException { if (this.cachedIndex.size() == 0) return null; return this.toBytes(this.cachedIndex, false); } public byte[] toBytes(List<IndexRow> rows, boolean isUnique) throws IOException { if (rows.size() == 0) return null; switch (tableType) { case FREQUENCY_TABLE: return toBytesFrequency(rows, isUnique); case OFFSET_TABLE: return toBytesOffset(rows, isUnique); case POSITION_TABLE: return toBytesPositions(rows, isUnique); case DOCMETA_FREQUENCY_TABLE: return toBytesDocMetaWithFrequency(rows, isUnique); case DOCMETA_FLAG_TABLE: return toBytesDocMetaWithFlag(rows, isUnique); default: throw new IOException("Unknown Index Type"); } } private byte[] toBytesFrequency(final List<IndexRow> rows, final boolean isUnique) throws IOException { this.tableFrequency.clear(); StringBuilder sb = null; String uniqueId = null; Set<String> uniqueRows = null; if (isUnique) { sb = new StringBuilder(1024); uniqueRows = new HashSet<String>(); } for (IndexRow row : rows) { int wordHash = row.hashCode(); if (isUnique) { sb.delete(0, sb.capacity()); sb.append(row.docType).append('\t').append(row.fieldType).append('\t').append(wordHash).append('\t') .append(row.docId).append('\t').append(row.occurance); uniqueId = sb.toString(); if (uniqueRows.contains(uniqueId)) continue; else uniqueRows.add(uniqueId); } this.tableFrequency.put(row.docType, row.fieldType, wordHash, row.docId, setPayloadWithOccurance(row.docType, row.fieldType, wordHash, row.docId, row.occurance)); } byte[] data = this.tableFrequency.toBytes(); if (null != uniqueRows) uniqueRows.clear(); this.tableFrequency.clear(); return data; } public int setPayloadWithOccurance(int docType, int fieldType, int wordHash, int docId, int occurance) { return occurance; } private byte[] toBytesOffset(final List<IndexRow> rows, final boolean isUnique) throws IOException { this.tableOffset.clear(); StringBuilder sb = null; String uniqueId = null; Set<String> uniqueRows = null; if (isUnique) { sb = new StringBuilder(1024); uniqueRows = new HashSet<String>(); } for (IndexRow row : rows) { int wordHash = row.hashCode(); if (isUnique) { sb.delete(0, sb.capacity()); sb.append(row.docType).append('\t').append(row.fieldType).append('\t').append(wordHash).append('\t') .append(row.docId); uniqueId = sb.toString(); if (uniqueRows.contains(uniqueId)) continue; else uniqueRows.add(uniqueId); } byte[] offsetB = SortedBytesInteger.getInstance().toBytes(row.offsetL); this.tableOffset.put(row.docType, row.fieldType, wordHash, row.docId, setPayloadWithOffsets(row.docType, row.fieldType, wordHash, row.docId, offsetB)); } byte[] data = this.tableOffset.toBytes(); if (null != uniqueRows) uniqueRows.clear(); this.tableOffset.clear(); return data; } public byte[] setPayloadWithOffsets(int docType, int fieldType, int wordHash, int docId, byte[] offsetB) { return offsetB; } private byte[] toBytesPositions(final List<IndexRow> rows, final boolean isUnique) throws IOException { this.tablePositions.clear(); StringBuilder sb = null; String uniqueId = null; Set<String> uniqueRows = null; if (isUnique) { sb = new StringBuilder(1024); uniqueRows = new HashSet<String>(); } for (IndexRow row : rows) { int wordHash = row.hashCode(); if (isUnique) { sb.delete(0, sb.capacity()); sb.append(row.docType).append('\t').append(row.fieldType).append('\t').append(wordHash).append('\t') .append(row.docId); uniqueId = sb.toString(); if (uniqueRows.contains(uniqueId)) continue; else uniqueRows.add(uniqueId); } byte[] positionsB = SortedBytesInteger.getInstance().toBytes(row.positionL); this.tablePositions.put(row.docType, row.fieldType, wordHash, row.docId, setPayloadWithPositions(row.docType, row.fieldType, wordHash, row.docId, positionsB)); } byte[] data = this.tablePositions.toBytes(); if (null != uniqueRows) uniqueRows.clear(); this.tablePositions.clear(); return data; } public byte[] setPayloadWithPositions(int docType, int fieldType, int wordHash, int docId, byte[] positionsB) { return positionsB; } private byte[] toBytesDocMetaWithFrequency(final List<IndexRow> rows, final boolean isUnique) throws IOException { this.tableDocMetaWithFrequency.clear(); StringBuilder sb = null; String uniqueId = null; Set<String> uniqueRows = null; if (isUnique) { sb = new StringBuilder(1024); uniqueRows = new HashSet<String>(); } for (IndexRow row : rows) { int wordHash = row.hashCode(); String docMeta = (null == row.docMeta) ? "-" : row.docMeta.getTexualFilterLine(); if (isUnique) { sb.delete(0, sb.capacity()); sb.append(row.docType).append('\t').append(row.fieldType).append('\t').append(docMeta).append('\t') .append(wordHash).append('\t').append(row.docId).append('\t').append(row.occurance); uniqueId = sb.toString(); if (uniqueRows.contains(uniqueId)) continue; else uniqueRows.add(uniqueId); } String docMetaB = (null == row.docMeta) ? "-" : row.docMeta.filter; this.tableDocMetaWithFrequency.put(row.docType, row.fieldType, docMetaB, wordHash, row.docId, setDocMetaWithOccurance(row.docType, row.fieldType, docMetaB, wordHash, row.docId, row.occurance)); } byte[] data = this.tableDocMetaWithFrequency.toBytes(); if (null != uniqueRows) uniqueRows.clear(); this.tableDocMetaWithFrequency.clear(); return data; } public int setDocMetaWithOccurance(int docType, int fieldType, String docMeta, int wordHash, int docId, int occurance) { return occurance; } private byte[] toBytesDocMetaWithFlag(final List<IndexRow> rows, final boolean isUnique) throws IOException { this.tableDocMetaWithFlag.clear(); StringBuilder sb = null; String uniqueId = null; Set<String> uniqueRows = null; if (isUnique) { sb = new StringBuilder(1024); uniqueRows = new HashSet<String>(); } for (IndexRow row : rows) { int wordHash = row.hashCode(); String docMeta = (null == row.docMeta) ? "-" : row.docMeta.getTexualFilterLine(); if (isUnique) { sb.delete(0, sb.capacity()); sb.append(row.docType).append('\t').append(row.fieldType).append('\t').append(docMeta).append('\t') .append(wordHash).append('\t').append(row.docId).append('\t').append(row.flag); uniqueId = sb.toString(); if (uniqueRows.contains(uniqueId)) continue; else uniqueRows.add(uniqueId); } String docMetaB = (null == row.docMeta) ? "-" : row.docMeta.filter; this.tableDocMetaWithFlag.put(row.docType, row.fieldType, docMetaB, wordHash, row.docId, setDocMetaWithFlag(row.docType, row.fieldType, docMetaB, wordHash, row.docId, row.flag)); } byte[] data = this.tableDocMetaWithFlag.toBytes(); if (null != uniqueRows) uniqueRows.clear(); this.tableDocMetaWithFlag.clear(); return data; } public boolean setDocMetaWithFlag(int docType, int fieldType, String docMeta, int wordHash, int docId, boolean flag) { return flag; } public void close() throws IOException { if (null != this.cachedIndex) this.cachedIndex.clear(); switch (tableType) { case FREQUENCY_TABLE: this.tableFrequency.clear(); break; case OFFSET_TABLE: this.tableOffset.clear(); break; case POSITION_TABLE: this.tablePositions.clear(); break; case DOCMETA_FREQUENCY_TABLE: this.tableDocMetaWithFrequency.clear(); break; case DOCMETA_FLAG_TABLE: this.tableDocMetaWithFlag.clear(); break; default: throw new IOException("Unknown Index Type"); } } public void addDocument(int docId, Document doc) throws IOException, InstantiationException { addDocument(docId, doc, unknownDocumentType); } public void addDocument(int docId, Document doc, String documentType) throws IOException, InstantiationException { addDocument(docId, doc, documentType, AnalyzerFactory.getInstance()); } public void addDocument(int docId, Document doc, String documentType, AnalyzerFactory analyzer) throws CorruptIndexException, IOException, InstantiationException { Map<String, IndexRow> uniqueRows = new HashMap<String, IndexWriter.IndexRow>(); addDocument(docId, doc, documentType, analyzer, uniqueRows); } public void addDocument(int docId, Document doc, String documentType, AnalyzerFactory analyzers, Map<String, IndexRow> uniqueTokens) throws CorruptIndexException, IOException, InstantiationException { addDocument(docId, doc, documentType, null, analyzers, uniqueTokens); } public void addDocument(int docId, Document doc, String documentType, DocumentMetadata docFilter, AnalyzerFactory analyzers) throws CorruptIndexException, IOException, InstantiationException { Map<String, IndexRow> uniqueTokens = new HashMap<String, IndexWriter.IndexRow>(); addDocument(docId, doc, documentType, docFilter, analyzers, uniqueTokens); } public void addDocument(int docId, Document doc, String documentType, DocumentMetadata docFilter, AnalyzerFactory analyzers, Map<String, IndexRow> uniqueTokens) throws CorruptIndexException, IOException, InstantiationException { int docType = sConf.getDocumentTypeCodes().getCode(documentType); for (Fieldable field : doc.getFields()) { uniqueTokens.clear(); int fieldType = sConf.getFieldTypeCodes().getCode(field.name()); if (field.isTokenized()) { StringReader sr = new StringReader(field.stringValue()); TokenStream stream = analyzers.getAnalyzer(field.name()).tokenStream(field.name(), sr); tokenize(stream, docId, docType, docFilter, fieldType, uniqueTokens); sr.close(); } else { IndexRow row = new IndexRow(docId, field.stringValue(), docType, fieldType, 0, 0); if (null != docFilter) row.docMeta = docFilter; cachedIndex.add(row); } } if (null != uniqueTokens) uniqueTokens.clear(); } public void commit(String tableName, String mergeId, String indexName, boolean keepDuplicates) throws IOException { HBaseTableSchemaDefn schema = HBaseTableSchemaDefn.getInstance(tableName); if (!schema.columnPartions.containsKey(indexName)) { throw new IOException("Unable to find partion points for " + indexName + ". Please initialize schema"); } Map<String, Map<Character, List<IndexRow>>> partitionCells = new HashMap<String, Map<Character, List<IndexRow>>>( schema.columnPartions.size()); segregateOnFamilyColumn(indexName, schema, partitionCells); for (String family : partitionCells.keySet()) { Map<Character, List<IndexRow>> cols = partitionCells.get(family); for (Character column : cols.keySet()) { List<IndexRow> rows = cols.get(column); byte[] data = null; data = getBytes(rows, keepDuplicates); if (null == data) continue; byte[] colNameBytes = new String(new char[] { column }).getBytes(); RecordScalar mergedTable = new RecordScalar(mergeId.getBytes(), new NV(family.getBytes(), colNameBytes, data)); List<RecordScalar> records = new ArrayList<RecordScalar>(1); records.add(mergedTable); HWriter.getInstance(true).insertScalar(tableName, records); } } } public final byte[] getBytes(boolean keepDuplicates) throws IOException { return getBytes(this.cachedIndex, keepDuplicates); } public final byte[] getBytes(List<IndexRow> rows, boolean keepDuplicates) throws IOException { byte[] data = null; switch (tableType) { case FREQUENCY_TABLE: data = this.toBytes(rows, keepDuplicates); if (INFO_ENABLED) IdSearchLog.l.info("Total rows in frequency table:\t" + rows.size()); break; case POSITION_TABLE: for (IndexRow row : rows) { byte[] positionsB = SortedBytesInteger.getInstance().toBytes(row.positionL); this.tablePositions.put(row.docType, row.fieldType, row.hashCode(), row.docId, positionsB); } data = this.toBytes(rows, keepDuplicates); if (INFO_ENABLED) IdSearchLog.l.info("Total rows in position table:\t" + rows.size()); this.tablePositions.clear(); break; case OFFSET_TABLE: for (IndexRow row : rows) { byte[] offsetB = SortedBytesInteger.getInstance().toBytes(row.offsetL); this.tableOffset.put(row.docType, row.fieldType, row.hashCode(), row.docId, offsetB); } data = this.toBytes(rows, keepDuplicates); if (INFO_ENABLED) IdSearchLog.l.info("Total rows in offset table:\t" + rows.size()); this.tableOffset.clear(); break; case DOCMETA_FREQUENCY_TABLE: data = this.toBytes(rows, keepDuplicates); if (INFO_ENABLED) IdSearchLog.l.info("Total rows in docmeta table:\t" + rows.size()); break; case DOCMETA_FLAG_TABLE: data = this.toBytes(rows, keepDuplicates); if (INFO_ENABLED) IdSearchLog.l.info("Total rows in docmeta flag table:\t" + rows.size()); break; default: throw new IOException("Unknown Index Type"); } return data; } public void segregateOnFamilyColumn(String field, HBaseTableSchemaDefn schema, Map<String, Map<Character, List<IndexRow>>> partitionCells) throws IOException { @SuppressWarnings("rawtypes") IPartition partitions = schema.columnPartions.get(field); for (IndexRow row : this.cachedIndex) { String token = row.token; @SuppressWarnings("unchecked") String family = partitions.getColumnFamily(new Integer(Hashing.hash(token)).toString()); char colName = HBaseTableSchemaDefn.getColumnName(Hashing.hash(token)); Map<Character, List<IndexRow>> familyMap = null; if (partitionCells.containsKey(family)) { familyMap = partitionCells.get(family); } else { familyMap = new HashMap<Character, List<IndexRow>>(); partitionCells.put(family, familyMap); } List<IndexRow> rows = null; if (familyMap.containsKey(colName)) { rows = familyMap.get(colName); } else { rows = new ArrayList<IndexWriter.IndexRow>(); familyMap.put(colName, rows); } rows.add(row); } } /** * Find the last offset. * Find each term offset * * @param stream * @param docId * @param docType * @param fieldType * @param fieldBoost * @param codecs * @param uniqueTokens * @throws IOException */ private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter, int fieldType, Map<String, IndexRow> uniqueTokens) throws IOException { String token = null; int curoffset = 0; int lastoffset = 0; int position = -1; StringBuilder sb = new StringBuilder(); CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class); OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class); while (stream.incrementToken()) { token = termA.toString(); curoffset = offsetA.endOffset(); if (lastoffset != curoffset) position++; lastoffset = curoffset; String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter); sb.delete(0, sb.capacity()); if (uniqueTokens.containsKey(key)) { IndexRow existingRow = uniqueTokens.get(key); existingRow.set(curoffset, position); existingRow.occurance++; } else { IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position); if (null != filter) row.docMeta = filter; uniqueTokens.put(key, row); } } stream.end(); stream.close(); for (IndexRow row : uniqueTokens.values()) cachedIndex.add(row); } private static class IndexRow { DocumentMetadata docMeta = null; public int docId; public String token; public int docType; public int fieldType; public List<Integer> offsetL = new ArrayList<Integer>(); public List<Integer> positionL = new ArrayList<Integer>(); public int occurance = 1; public boolean flag = true; public IndexRow(int docId, String token, int docType, int fieldType, int offset, int position) { this(docId, token, docType, fieldType); set(offset, position); } public IndexRow(int docId, String token, int docType, int fieldType) { this.docId = docId; this.token = token; this.docType = docType; this.fieldType = fieldType; } public void set(int offset, int position) { this.offsetL.add(offset); this.positionL.add(position); } @Override public int hashCode() { return Hashing.hash(token); } public static String generateKey(StringBuilder sb, int docId, String token, int docType, int fieldType, DocumentMetadata meta) { sb.append(docType).append('|').append(token).append('|').append(docId).append('|').append(fieldType); if (null != meta) sb.append('|').append(meta.getTexualFilterLine()); return sb.toString(); } } } /** double occuranceBoost = Math.log10(row.occurance); if ( occuranceBoost > 1 ) occuranceBoost = 1; double locationBoost = ( lastOffset - row.offset)/ lastOffset; float finalBoost = (float) ( row.boost + occuranceBoost + locationBoost ); */