Example usage for org.apache.lucene.util.BytesRef BytesRef()

List of usage examples for the org.apache.lucene.util.BytesRef no-argument constructor

Introduction

This page collects example usages of the no-argument constructor of org.apache.lucene.util.BytesRef, BytesRef().

Prototype

public BytesRef() 

Document

Create a BytesRef with EMPTY_BYTES
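
As a quick orientation before the full examples, the sketch below (not taken from any of the source files listed here; the class name is illustrative) shows what the no-argument constructor gives you: a zero-length reference backed by EMPTY_BYTES whose public bytes, offset and length fields are typically filled in, or reassigned, later.

import java.nio.charset.StandardCharsets;

import org.apache.lucene.util.BytesRef;

public class BytesRefScratchExample {
    public static void main(String[] args) {
        // new BytesRef() points at BytesRef.EMPTY_BYTES and has length 0.
        BytesRef scratch = new BytesRef();
        System.out.println("initial length: " + scratch.length); // prints 0

        // Typical scratch-buffer use: point the reference at bytes you own.
        byte[] encoded = "hello".getBytes(StandardCharsets.UTF_8);
        scratch.bytes = encoded;
        scratch.offset = 0;
        scratch.length = encoded.length;
        System.out.println(scratch.utf8ToString()); // prints "hello"
    }
}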

Usage

From source file:cc.twittertools.index.ExtractTermStatisticsFromIndex.java

License:Apache License

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options);
        System.exit(-1);
    }

    String indexLocation = cmdline.getOptionValue(INDEX_OPTION);
    int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1;

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name);
    TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);

    long missingCnt = 0;
    int skippedTerms = 0;
    BytesRef bytes = new BytesRef();
    while ((bytes = termsEnum.next()) != null) {
        byte[] buf = new byte[bytes.length];
        System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length);
        String term = new String(buf, "UTF-8");
        int df = termsEnum.docFreq();
        long cf = termsEnum.totalTermFreq();

        if (df < min) {
            skippedTerms++;
            missingCnt += cf;
            continue;
        }

        out.println(term + "\t" + df + "\t" + cf);
    }

    reader.close();
    out.close();
    System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt);
}
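
Note that the loop above copies the term bytes into a fresh buffer before decoding: a TermsEnum generally reuses the BytesRef instance it returns, so the bytes must be copied (or decoded immediately) before the next call to next(). The manual System.arraycopy followed by new String(buf, "UTF-8") is equivalent to simply calling bytes.utf8ToString(), which also returns a new String.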

From source file:com.kmwllc.search.graph.GraphTermsCollector.java

private void addEdgeIdsToResult(int doc) throws IOException {
    // set the doc to pull the edges ids for.
    docTermOrds.setDocument(doc);
    // TODO: why is this final?
    BytesRef scratch = new BytesRef();
    long ord;
    while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        scratch = docTermOrds.lookupOrd(ord);
        // add the edge id to the collector terms.
        // TODO: how do we handle non-string type fields?
        // do i need to worry about that here?
        collectorTerms.add(scratch);
    }
}

From source file:com.lucid.solr.sidecar.SidecarIndexReaderFactory.java

License:Apache License

DirectoryReader buildParallelReader(DirectoryReader main, SolrIndexSearcher source, boolean rebuild) {
    try {
        if (source == null) {
            throw new Exception("Source collection is missing.");
        }
        // create as a sibling path of the main index
        Directory d = main.directory();
        File primaryDir = null;
        if (d instanceof FSDirectory) {
            String path = ((FSDirectory) d).getDirectory().getPath();
            primaryDir = new File(path);
            sidecarIndex = new File(primaryDir.getParentFile(), sidecarIndexLocation);
        } else {
            String secondaryPath = System.getProperty("java.io.tmpdir") + File.separator + sidecarIndexLocation
                    + "-" + System.currentTimeMillis();
            sidecarIndex = new File(secondaryPath);
        }
        // create a new tmp dir for the secondary indexes
        File secondaryIndex = new File(sidecarIndex, System.currentTimeMillis() + "-index");
        if (rebuild) {
            safeDelete(sidecarIndex);
        }
        parallelFields.addAll(source.getFieldNames());
        parallelFields.remove("id");
        LOG.debug("building a new index");
        Directory dir = FSDirectory.open(secondaryIndex);
        if (IndexWriter.isLocked(dir)) {
            // try forcing unlock
            try {
                IndexWriter.unlock(dir);
            } catch (Exception e) {
                LOG.warn("Failed to unlock " + secondaryIndex);
            }
        }
        int[] mergeTargets;
        AtomicReader[] subReaders = SidecarIndexReader.getSequentialSubReaders(main);
        if (subReaders == null || subReaders.length == 0) {
            mergeTargets = new int[] { main.maxDoc() };
        } else {
            mergeTargets = new int[subReaders.length];
            for (int i = 0; i < subReaders.length; i++) {
                mergeTargets[i] = subReaders[i].maxDoc();
            }
        }
        Version ver = currentCore.getLatestSchema().getDefaultLuceneMatchVersion();
        IndexWriterConfig cfg = new IndexWriterConfig(ver, currentCore.getLatestSchema().getAnalyzer());
        //cfg.setInfoStream(System.err);
        cfg.setMergeScheduler(new SerialMergeScheduler());
        cfg.setMergePolicy(new SidecarMergePolicy(mergeTargets, false));
        IndexWriter iw = new IndexWriter(dir, cfg);
        LOG.info("processing " + main.maxDoc() + " docs / " + main.numDeletedDocs() + " dels in main index");
        int boostedDocs = 0;
        Bits live = MultiFields.getLiveDocs(main);

        int targetPos = 0;
        int nextTarget = mergeTargets[targetPos];
        BytesRef idRef = new BytesRef();
        for (int i = 0; i < main.maxDoc(); i++) {
            if (i == nextTarget) {
                iw.commit();
                nextTarget = nextTarget + mergeTargets[++targetPos];
            }
            if (live != null && !live.get(i)) {
                addDummy(iw); // this is required to preserve doc numbers.
                continue;
            } else {
                DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(docIdField);
                main.document(i, visitor);
                Document doc = visitor.getDocument();
                // get docId
                String id = doc.get(docIdField);
                if (id == null) {
                    LOG.debug("missing id, docNo=" + i);
                    addDummy(iw);
                    continue;
                } else {
                    // find the data, if any
                    doc = lookup(source, id, idRef, parallelFields);
                    if (doc == null) {
                        LOG.debug("missing boost data, docId=" + id);
                        addDummy(iw);
                        continue;
                    } else {
                        LOG.debug("adding boost data, docId=" + id + ", b=" + doc);
                        iw.addDocument(doc);
                        boostedDocs++;
                    }
                }
            }
        }
        iw.close();
        DirectoryReader other = DirectoryReader.open(dir);
        LOG.info("SidecarIndexReader with " + boostedDocs + " boosted documents.");
        SidecarIndexReader pr = createSidecarIndexReader(main, other, sourceCollection, secondaryIndex);
        return pr;
    } catch (Exception e) {
        LOG.warn("Unable to build parallel index: " + e.toString(), e);
        LOG.warn("Proceeding with single main index.");
        try {
            return new SidecarIndexReader(this, main, null, SidecarIndexReader.getSequentialSubReaders(main),
                    sourceCollection, null);
        } catch (Exception e1) {
            LOG.warn("Unexpected exception, returning single main index", e1);
            return main;
        }
    }
}

From source file:com.lucure.core.codec.CompressingStoredFieldsReader.java

License:Apache License

/** Sole constructor. */
public CompressingStoredFieldsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
        IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
    this.compressionMode = compressionMode;
    final String segment = si.name;
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.getDocCount();
    ChecksumIndexInput indexStream = null;
    try {
        final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix,
                FIELDS_INDEX_EXTENSION);
        final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
        // Load the index into memory
        indexStream = d.openChecksumInput(indexStreamFN, context);
        final String codecNameIdx = formatName + CODEC_SFX_IDX;
        version = CodecUtil.checkHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT);
        assert CodecUtil.headerLength(codecNameIdx) == indexStream.getFilePointer();
        indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);

        long maxPointer = -1;

        if (version >= VERSION_CHECKSUM) {
            maxPointer = indexStream.readVLong();
            CodecUtil.checkFooter(indexStream);
        } else {
            CodecUtil.checkEOF(indexStream);
        }
        indexStream.close();
        indexStream = null;

        // Open the data file and read metadata
        fieldsStream = d.openInput(fieldsStreamFN, context);
        if (version >= VERSION_CHECKSUM) {
            if (maxPointer + CodecUtil.footerLength() != fieldsStream.length()) {
                throw new CorruptIndexException("Invalid fieldsStream maxPointer (file truncated?): maxPointer="
                        + maxPointer + ", length=" + fieldsStream.length());
            }
        } else {
            maxPointer = fieldsStream.length();
        }
        this.maxPointer = maxPointer;
        final String codecNameDat = formatName + CODEC_SFX_DAT;
        final int fieldsVersion = CodecUtil.checkHeader(fieldsStream, codecNameDat, VERSION_START,
                VERSION_CURRENT);
        if (version != fieldsVersion) {
            throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version
                    + " != " + fieldsVersion);
        }
        assert CodecUtil.headerLength(codecNameDat) == fieldsStream.getFilePointer();

        if (version >= VERSION_BIG_CHUNKS) {
            chunkSize = fieldsStream.readVInt();
        } else {
            chunkSize = -1;
        }
        packedIntsVersion = fieldsStream.readVInt();
        decompressor = compressionMode.newDecompressor();
        this.bytes = new BytesRef();

        if (version >= VERSION_CHECKSUM) {
            // NOTE: data file is too costly to verify checksum against all the bytes on open,
            // but for now we at least verify proper structure of the checksum footer: which looks
            // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
            // such as file truncation.
            CodecUtil.retrieveChecksum(fieldsStream);
        }

        success = true;
    } finally {
        if (!success) {
            IOUtils.closeWhileHandlingException(this, indexStream);
        }
    }
}
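
The BytesRef allocated at the end of this constructor (this.bytes = new BytesRef()) is the reader's reusable decompression buffer: visitDocument below repeatedly fills it from the decompressor and, for documents small enough to fall under the reuse threshold, reads stored fields directly out of it instead of allocating a new BytesRef per document.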

From source file:com.lucure.core.codec.CompressingStoredFieldsReader.java

License:Apache License

@Override
public void visitDocument(int docID, StoredFieldVisitor visitor) throws IOException {
    fieldsStream.seek(indexReader.getStartPointer(docID));

    final int docBase = fieldsStream.readVInt();
    final int chunkDocs = fieldsStream.readVInt();
    if (docID < docBase || docID >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
        throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs="
                + chunkDocs + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")");
    }

    final int numStoredFields, offset, length, totalLength;
    if (chunkDocs == 1) {
        numStoredFields = fieldsStream.readVInt();
        offset = 0;
        length = fieldsStream.readVInt();
        totalLength = length;
    } else {
        final int bitsPerStoredFields = fieldsStream.readVInt();
        if (bitsPerStoredFields == 0) {
            numStoredFields = fieldsStream.readVInt();
        } else if (bitsPerStoredFields > 31) {
            throw new CorruptIndexException(
                    "bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
        } else {
            final long filePointer = fieldsStream.getFilePointer();
            final PackedInts.Reader reader = PackedInts.getDirectReaderNoHeader(fieldsStream,
                    PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields);
            numStoredFields = (int) (reader.get(docID - docBase));
            fieldsStream.seek(filePointer
                    + PackedInts.Format.PACKED.byteCount(packedIntsVersion, chunkDocs, bitsPerStoredFields));
        }

        final int bitsPerLength = fieldsStream.readVInt();
        if (bitsPerLength == 0) {
            length = fieldsStream.readVInt();
            offset = (docID - docBase) * length;
            totalLength = chunkDocs * length;
        } else if (bitsPerLength > 31) {
            throw new CorruptIndexException(
                    "bitsPerLength=" + bitsPerLength + " (resource=" + fieldsStream + ")");
        } else {
            final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream,
                    PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
            int off = 0;
            for (int i = 0; i < docID - docBase; ++i) {
                off += it.next();
            }
            offset = off;
            length = (int) it.next();
            off += length;
            for (int i = docID - docBase + 1; i < chunkDocs; ++i) {
                off += it.next();
            }
            totalLength = off;
        }
    }

    if ((length == 0) != (numStoredFields == 0)) {
        throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields
                + " (resource=" + fieldsStream + ")");
    }
    if (numStoredFields == 0) {
        // nothing to do
        return;
    }

    final DataInput documentInput;
    if (version >= VERSION_BIG_CHUNKS && totalLength >= 2 * chunkSize) {
        assert chunkSize > 0;
        assert offset < chunkSize;

        decompressor.decompress(fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes);
        documentInput = new DataInput() {

            int decompressed = bytes.length;

            void fillBuffer() throws IOException {
                assert decompressed <= length;
                if (decompressed == length) {
                    throw new EOFException();
                }
                final int toDecompress = Math.min(length - decompressed, chunkSize);
                decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes);
                decompressed += toDecompress;
            }

            @Override
            public byte readByte() throws IOException {
                if (bytes.length == 0) {
                    fillBuffer();
                }
                --bytes.length;
                return bytes.bytes[bytes.offset++];
            }

            @Override
            public void readBytes(byte[] b, int offset, int len) throws IOException {
                while (len > bytes.length) {
                    System.arraycopy(bytes.bytes, bytes.offset, b, offset, bytes.length);
                    len -= bytes.length;
                    offset += bytes.length;
                    fillBuffer();
                }
                System.arraycopy(bytes.bytes, bytes.offset, b, offset, len);
                bytes.offset += len;
                bytes.length -= len;
            }

        };
    } else {
        final BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? this.bytes : new BytesRef();
        decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
        assert bytes.length == length;
        documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
    }

    for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) {
        final long infoAndBits = documentInput.readVLong();
        final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
        final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);

        final int bits = (int) (infoAndBits & TYPE_MASK);
        assert bits <= NUMERIC_DOUBLE : "bits=" + Integer.toHexString(bits);

        //get restricted
        FieldVisibility cv = RestrictedStoredFieldVisitor.EMPTY;
        boolean isRestricted = documentInput.readByte() == 1;
        if (isRestricted) {
            int cv_length = documentInput.readVInt();
            byte[] cv_bytes = new byte[cv_length];
            documentInput.readBytes(cv_bytes, 0, cv_length);
            cv = new FieldVisibility(cv_bytes);
        }

        RestrictedStoredFieldVisitor restrictedStoredFieldVisitor = DelegatingRestrictedFieldVisitor
                .wrap(visitor);
        if (evaluate(cv)) {
            switch (restrictedStoredFieldVisitor.needsField(fieldInfo, cv)) {
            case YES:
                readField(documentInput, restrictedStoredFieldVisitor, fieldInfo, bits, cv);
                break;
            case NO:
                skipField(documentInput, bits, cv);
                break;
            case STOP:
                return;
            }
        } else {
            skipField(documentInput, bits, cv);
        }
    }
}

From source file:com.rocana.lucene.codec.v1.RocanaBlockTreeTermsReader.java

License:Apache License

private static BytesRef readBytesRef(IndexInput in) throws IOException {
    BytesRef bytes = new BytesRef();
    bytes.length = in.readVInt();
    bytes.bytes = new byte[bytes.length];
    in.readBytes(bytes.bytes, 0, bytes.length);
    return bytes;
}

From source file:com.rondhuit.w2v.lucene.LuceneIndexCorpus.java

License:Apache License

@Override
public void learnVocab() throws IOException {
    super.learnVocab();

    final String field = ((LuceneIndexConfig) config).getField();
    final Terms terms = MultiFields.getTerms(reader, field);
    final BytesRef maxTerm = terms.getMax();
    final BytesRef minTerm = terms.getMin();
    Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    topDocs = searcher.search(q, Integer.MAX_VALUE);

    TermsEnum termsEnum = null;
    termsEnum = terms.iterator(termsEnum);

    termsEnum.seekCeil(new BytesRef());
    BytesRef term = termsEnum.term();
    while (term != null) {
        int p = addWordToVocab(term.utf8ToString());
        vocab[p].setCn((int) termsEnum.totalTermFreq());
        term = termsEnum.next();
    }
}

From source file:com.shaie.PhraseVsSpanQuery.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Directory dir = new RAMDirectory();
    final IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    final IndexWriter writer = new IndexWriter(dir, conf);

    final Document doc = new Document();
    doc.add(new TextField("f", new TokenStream() {
        final PositionIncrementAttribute pos = addAttribute(PositionIncrementAttribute.class);
        final CharTermAttribute term = addAttribute(CharTermAttribute.class);
        boolean first = true, done = false;

        @Override
        public boolean incrementToken() throws IOException {
            if (done) {
                return false;
            }
            if (first) {
                term.setEmpty().append("a");
                pos.setPositionIncrement(1);
                first = false;
            } else {
                term.setEmpty().append("b");
                pos.setPositionIncrement(0);
                done = true;
            }
            return true;
        }
    }));
    writer.addDocument(doc);
    writer.close();

    final DirectoryReader reader = DirectoryReader.open(dir);
    final IndexSearcher searcher = new IndexSearcher(reader);
    final LeafReader ar = reader.leaves().get(0).reader();
    final TermsEnum te = ar.terms("f").iterator();
    BytesRef scratch = new BytesRef();
    while ((scratch = te.next()) != null) {
        System.out.println(scratch.utf8ToString());
        final PostingsEnum dape = ar.postings(new Term("f", scratch.utf8ToString()));
        System.out.println("  doc=" + dape.nextDoc() + ", pos=" + dape.nextPosition());
    }

    System.out.println();

    // try a phrase query with a slop
    final PhraseQuery pqNoSlop = buildPhraseQuery(0);
    System.out.println("searching for \"a b\"; num results = " + searcher.search(pqNoSlop, 10).totalHits);

    final PhraseQuery pqSlop1 = buildPhraseQuery(1);
    System.out.println("searching for \"a b\"~1; num results = " + searcher.search(pqSlop1, 10).totalHits);

    final PhraseQuery pqSlop3 = buildPhraseQuery(3);
    System.out.println("searching for \"a b\"~3; num results = " + searcher.search(pqSlop3, 10).totalHits);

    final SpanNearQuery snqUnOrdered = new SpanNearQuery(
            new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1,
            false);
    System.out.println("searching for SpanNearUnordered('a', 'b'), slop=1; num results = "
            + searcher.search(snqUnOrdered, 10).totalHits);

    final SpanNearQuery snqOrdered = new SpanNearQuery(
            new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1,
            true);
    System.out.println("searching for SpanNearOrdered('a', 'b'), slop=1; num results = "
            + searcher.search(snqOrdered, 10).totalHits);

    reader.close();
}

From source file:com.sindicetech.siren.search.node.NodeConstantScoreAutoRewrite.java

License:Open Source License

@Override
public Query rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {

    // Disabled cutoffs
    final int docCountCutoff = Integer.MAX_VALUE;
    final int termCountLimit = Integer.MAX_VALUE;

    final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
    this.collectTerms(reader, query, col);
    final int size = col.pendingTerms.size();

    if (col.hasCutOff) {
        return MultiNodeTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
    } else if (size == 0) {
        return this.getTopLevelQuery(query);
    } else {
        final NodeBooleanQuery bq = this.getTopLevelQuery(query);
        final BytesRefHash pendingTerms = col.pendingTerms;
        final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            // docFreq is not used for constant score here, we pass 1
            // to explicitely set a fake value, so it's not calculated
            this.addClause(bq, new Term(query.field, pendingTerms.get(pos, new BytesRef())), 1, 1.0f,
                    col.array.termState[pos]);
        }
        // Strip scores
        final NodeQuery result = new NodeConstantScoreQuery(bq);
        result.setBoost(query.getBoost());
        return result;
    }
}

From source file:com.sindicetech.siren.search.node.NodeScoringRewrite.java

License:Open Source License

@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final Q result = this.getTopLevelQuery(query);
    final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
    this.collectTerms(reader, query, col);

    final int size = col.terms.size();
    if (size > 0) {
        final int sort[] = col.terms.sort(col.termsEnum.getComparator());
        final float[] boost = col.array.boost;
        final TermContext[] termStates = col.array.termState;
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
            assert reader.docFreq(term) == termStates[pos].docFreq();
            this.addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos],
                    termStates[pos]);
        }
    }
    return result;
}