List of usage examples for org.apache.lucene.analysis Analyzer getPositionIncrementGap
public int getPositionIncrementGap(String fieldName)
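getPositionIncrementGap returns the extra position increment inserted between two instances of the same field in a document, i.e. between the values of a multi-valued field; the base Analyzer implementation returns 0. Returning a larger gap keeps phrase and span queries from matching across the boundary between two field values. Below is a minimal sketch against the pre-4.0 Analyzer API that the examples on this page use; GapAnalyzer, the WhitespaceAnalyzer delegate, and the gap of 100 are illustrative choices, not code from the sources listed.

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

// Hypothetical example: an Analyzer that tokenizes on whitespace but inserts
// a large position gap between successive values of a multi-valued field.
public class GapAnalyzer extends Analyzer {

    private final Analyzer delegate = new WhitespaceAnalyzer();

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return delegate.tokenStream(fieldName, reader);
    }

    @Override
    public int getPositionIncrementGap(String fieldName) {
        // Default is 0; 100 is an arbitrary gap large enough to keep
        // phrase queries from matching across field-value boundaries.
        return 100;
    }
}

In the examples below, note the recurring pattern position += analyzer.getPositionIncrementGap(field.name()) whenever a second value of the same field is about to be tokenized; that is where this gap takes effect.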
From source file:com.liferay.portal.search.lucene.PerFieldAnalyzerWrapper.java
License:Open Source License
@Override
public int getPositionIncrementGap(String fieldName) {
    Analyzer analyzer = _getAnalyzer(fieldName);

    return analyzer.getPositionIncrementGap(fieldName);
}
From source file:lucandra.IndexWriter.java
License:Apache License
@SuppressWarnings("unchecked") public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException { List<String> allIndexedTerms = new ArrayList<String>(); // check for special field name String docId = doc.get(CassandraUtils.documentIdField); if (docId == null) docId = Long.toHexString((long) (System.nanoTime() + (Math.random() * System.nanoTime()))); int position = 0; for (Fieldable field : (List<Fieldable>) doc.getFields()) { // Untokenized fields go in without a termPosition if (field.isIndexed() && !field.isTokenized()) { String term = CassandraUtils.createColumnName(field.name(), field.stringValue()); allIndexedTerms.add(term);// w ww.ja va 2s . c om String key = indexName + CassandraUtils.delimeter + term; Map<String, List<Number>> termMap = new HashMap<String, List<Number>>(); termMap.put(CassandraUtils.termFrequencyKey, CassandraUtils.emptyArray); termMap.put(CassandraUtils.positionVectorKey, CassandraUtils.emptyArray); CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily, docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, termMap); } else if (field.isIndexed()) { TokenStream tokens = field.tokenStreamValue(); if (tokens == null) { tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue())); } // collect term information per field Map<String, Map<String, List<Number>>> allTermInformation = new HashMap<String, Map<String, List<Number>>>(); int lastOffset = 0; if (position > 0) { position += analyzer.getPositionIncrementGap(field.name()); } // Build the termPositions vector for all terms tokens.reset(); // reset the TokenStream to the first token // set up token attributes we are working on // offsets OffsetAttribute offsetAttribute = null; if (field.isStoreOffsetWithTermVector()) offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class); // positions PositionIncrementAttribute posIncrAttribute = null; if (field.isStorePositionWithTermVector()) posIncrAttribute = (PositionIncrementAttribute) tokens .addAttribute(PositionIncrementAttribute.class); TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class); // store normalizations of field per term per document rather // than per field. 
// this adds more to write but less to read on other side Integer tokensInField = new Integer(0); while (tokens.incrementToken()) { tokensInField++; String term = CassandraUtils.createColumnName(field.name(), termAttribute.term()); allIndexedTerms.add(term); // fetch all collected information for this term Map<String, List<Number>> termInfo = allTermInformation.get(term); if (termInfo == null) { termInfo = new HashMap<String, List<Number>>(); allTermInformation.put(term, termInfo); } // term frequency { List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey); if (termFrequency == null) { termFrequency = new ArrayList<Number>(); termFrequency.add(new Integer(0)); termInfo.put(CassandraUtils.termFrequencyKey, termFrequency); } // increment termFrequency.set(0, termFrequency.get(0).intValue() + 1); } // position vector if (field.isStorePositionWithTermVector()) { position += (posIncrAttribute.getPositionIncrement() - 1); List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKey); if (positionVector == null) { positionVector = new ArrayList<Number>(); termInfo.put(CassandraUtils.positionVectorKey, positionVector); } positionVector.add(++position); } // term offsets if (field.isStoreOffsetWithTermVector()) { List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey); if (offsetVector == null) { offsetVector = new ArrayList<Number>(); termInfo.put(CassandraUtils.offsetVectorKey, offsetVector); } offsetVector.add(lastOffset + offsetAttribute.startOffset()); offsetVector.add(lastOffset + offsetAttribute.endOffset()); } } List<Number> bnorm = null; if (!field.getOmitNorms()) { bnorm = new ArrayList<Number>(); float norm = doc.getBoost(); norm *= field.getBoost(); norm *= similarity.lengthNorm(field.name(), tokensInField); bnorm.add(Similarity.encodeNorm(norm)); } for (Map.Entry<String, Map<String, List<Number>>> term : allTermInformation.entrySet()) { // Terms are stored within a unique key combination // This is required since cassandra loads all columns // in a key/column family into memory String key = indexName + CassandraUtils.delimeter + term.getKey(); // Mix in the norm for this field alongside each term // more writes but faster on read side. if (!field.getOmitNorms()) { term.getValue().put(CassandraUtils.normsKey, bnorm); } CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily, docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, term.getValue()); } } // Stores each field as a column under this doc key if (field.isStored()) { byte[] _value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8"); // first byte flags if binary or not byte[] value = new byte[_value.length + 1]; System.arraycopy(_value, 0, value, 0, _value.length); value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE); String key = indexName + CassandraUtils.delimeter + docId; CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily, field.name().getBytes("UTF-8"), CassandraUtils.hashKey(key), value, null); } } // Finally, Store meta-data so we can delete this document String key = indexName + CassandraUtils.delimeter + docId; CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily, CassandraUtils.documentMetaField.getBytes("UTF-8"), CassandraUtils.hashKey(key), CassandraUtils.toBytes(allIndexedTerms), null); if (autoCommit) CassandraUtils.robustBatchInsert(client, getMutationMap()); }
From source file:org.hbasene.index.HBaseIndexWriter.java
License:Apache License
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
    String docId = doc.get(this.primaryKeyField);
    if (docId == null) {
        throw new IllegalArgumentException(
                "Primary Key " + this.primaryKeyField + " not present in the document to be added ");
        // TODO: Special type of exception needed ?
    }

    int position = 0;
    Map<String, List<Integer>> termPositions = new HashMap<String, List<Integer>>();
    Map<String, byte[]> fieldsToStore = new HashMap<String, byte[]>();

    for (Fieldable field : doc.getFields()) {

        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            tokens.addAttribute(TermAttribute.class);
            tokens.addAttribute(PositionIncrementAttribute.class);

            // collect term frequencies per doc
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms
            while (tokens.incrementToken()) {
                String term = createColumnName(field.name(), tokens.getAttribute(TermAttribute.class).term());

                List<Integer> pvec = termPositions.get(term);

                if (pvec == null) {
                    pvec = Lists.newArrayList();
                    termPositions.put(term, pvec);
                }

                position += (tokens.getAttribute(PositionIncrementAttribute.class).getPositionIncrement() - 1);
                pvec.add(++position);
            }
            tokens.close();
        }

        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            String term = this.createColumnName(field.name(), field.stringValue());
            String key = term;
            termPositions.put(key, EMPTY_TERM_POSITIONS);
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());

            // first byte flags if binary or not
            final byte[] prefix = Bytes.toBytes((field.isBinary() ? 'B' : 'T'));

            fieldsToStore.put(field.name(), Bytes.add(prefix, value));
        }
    }
    indexStore.indexDocument(docId, new DocumentIndexContext(termPositions, fieldsToStore));
    termPositions.clear();
    fieldsToStore.clear();
}
From source file:org.solbase.lucenehbase.IndexWriter.java
License:Apache License
@SuppressWarnings("unchecked") public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException { // given doc, what are all of terms we indexed List<Term> allIndexedTerms = new ArrayList<Term>(); Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024); // need to hold onto TermDocMetaData, so it can return this array List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>(); byte[] docId = Bytes.toBytes(docNumber); int position = 0; for (Fieldable field : (List<Fieldable>) doc.getFields()) { // Indexed field if (field.isIndexed() && field.isTokenized()) { TokenStream tokens = field.tokenStreamValue(); if (tokens == null) { tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue())); }//w ww .jav a2s . c o m // collect term information per field Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>(); int lastOffset = 0; if (position > 0) { position += analyzer.getPositionIncrementGap(field.name()); } tokens.reset(); // reset the TokenStream to the first token // offsets OffsetAttribute offsetAttribute = null; if (field.isStoreOffsetWithTermVector()) offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class); // positions PositionIncrementAttribute posIncrAttribute = null; if (field.isStorePositionWithTermVector()) posIncrAttribute = (PositionIncrementAttribute) tokens .addAttribute(PositionIncrementAttribute.class); TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class); // store normalizations of field per term per document // rather // than per field. // this adds more to write but less to read on other side Integer tokensInField = new Integer(0); while (tokens.incrementToken()) { tokensInField++; Term term = new Term(field.name(), termAttribute.term()); allIndexedTerms.add(term); // fetch all collected information for this term Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term); if (termInfo == null) { termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>(); allTermInformation.put(term, termInfo); } // term frequency List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes); if (termFrequency == null) { termFrequency = new ArrayList<Number>(); termFrequency.add(new Integer(0)); termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency); } // increment termFrequency.set(0, termFrequency.get(0).intValue() + 1); // position vector if (field.isStorePositionWithTermVector()) { position += (posIncrAttribute.getPositionIncrement() - 1); List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes); if (positionVector == null) { positionVector = new ArrayList<Number>(); termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector); } positionVector.add(++position); } // term offsets if (field.isStoreOffsetWithTermVector()) { List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes); if (offsetVector == null) { offsetVector = new ArrayList<Number>(); termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector); } offsetVector.add(lastOffset + offsetAttribute.startOffset()); offsetVector.add(lastOffset + offsetAttribute.endOffset()); } List<Number> sortValues = new ArrayList<Number>(); // init sortValues for (int i = 0; i < Scorer.numSort; i++) { sortValues.add(new Integer(-1)); } int order = 0; // extract sort field value and 
store it in term doc metadata obj for (String fieldName : sortFieldNames) { Fieldable fieldable = doc.getFieldable(fieldName); if (fieldable instanceof EmbeddedSortField) { EmbeddedSortField sortField = (EmbeddedSortField) fieldable; int value = -1; if (sortField.stringValue() != null) { value = Integer.parseInt(sortField.stringValue()); } int sortSlot = sortField.getSortSlot(); sortValues.set(sortSlot - 1, new Integer(value)); } else { // TODO: this logic is used for real time indexing. // hacky. depending on order of sort field names in array int value = -1; if (fieldable.stringValue() != null) { value = Integer.parseInt(fieldable.stringValue()); } sortValues.set(order++, new Integer(value)); } } termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues); } List<Number> bnorm = null; if (!field.getOmitNorms()) { bnorm = new ArrayList<Number>(); float norm = doc.getBoost(); norm *= field.getBoost(); norm *= similarity.lengthNorm(field.name(), tokensInField); bnorm.add(Similarity.encodeNorm(norm)); } for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) { Term tempTerm = term.getKey(); byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm); // Mix in the norm for this field alongside each term // more writes but faster on read side. if (!field.getOmitNorms()) { term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm); } TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm); metadatas.add(data); } } // Untokenized fields go in without a termPosition if (field.isIndexed() && !field.isTokenized()) { Term term = new Term(field.name(), field.stringValue()); allIndexedTerms.add(term); byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term); Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>(); termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {})); termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {})); TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term); metadatas.add(data); } // Stores each field as a column under this doc key if (field.isStored()) { byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue()); // first byte flags if binary or not byte[] value = new byte[_value.length + 1]; System.arraycopy(_value, 0, value, 0, _value.length); value[value.length - 1] = (byte) (field.isBinary() ? 
Byte.MAX_VALUE : Byte.MIN_VALUE); // logic to handle multiple fields w/ same name byte[] currentValue = fieldCache.get(field.name()); if (currentValue == null) { fieldCache.put(field.name(), value); } else { // append new data byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1]; System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1); System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length); System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length); fieldCache.put(field.name(), newValue); } } } Put documentPut = new Put(SolbaseUtil.randomize(docNumber)); // Store each field as a column under this docId for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) { documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue()); } // in case of real time update, we need to add back docId field if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) { byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString()); // first byte flags if binary or not byte[] value = new byte[docIdStr.length + 1]; System.arraycopy(docIdStr, 0, value, 0, docIdStr.length); value[value.length - 1] = (byte) (Byte.MIN_VALUE); documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value); } // Finally, Store meta-data so we can delete this document documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array()); ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms); return parsedDoc; }