List of usage examples for the org.apache.lucene.util.BytesRef no-argument constructor
public BytesRef()
From source file:cc.twittertools.index.ExtractTermStatisticsFromIndex.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION)); CommandLine cmdline = null;/*from w ww. j av a 2 s . c om*/ CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options); System.exit(-1); } String indexLocation = cmdline.getOptionValue(INDEX_OPTION); int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1; PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation))); Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name); TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY); long missingCnt = 0; int skippedTerms = 0; BytesRef bytes = new BytesRef(); while ((bytes = termsEnum.next()) != null) { byte[] buf = new byte[bytes.length]; System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length); String term = new String(buf, "UTF-8"); int df = termsEnum.docFreq(); long cf = termsEnum.totalTermFreq(); if (df < min) { skippedTerms++; missingCnt += cf; continue; } out.println(term + "\t" + df + "\t" + cf); } reader.close(); out.close(); System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt); }
From source file:com.kmwllc.search.graph.GraphTermsCollector.java
private void addEdgeIdsToResult(int doc) throws IOException {
    // position the doc values at the document whose edge ids we want
    docTermOrds.setDocument(doc);
    BytesRef scratch = new BytesRef();
    long ord;
    while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        scratch = docTermOrds.lookupOrd(ord);
        // add the edge id to the collector terms.
        // TODO: how do we handle non-string type fields? do i need to worry about that here?
        collectorTerms.add(scratch);
    }
}
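lookupOrd may hand back a shared ref whose bytes are reused on the next call, so storing the returned scratch directly is only safe if collectorTerms copies on add (a BytesRefHash does; a plain java.util.Set<BytesRef> does not). A hedged variant, under the assumption that collectorTerms keeps the objects it is given:

long ord;
while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
    // deepCopyOf allocates a private buffer, so later lookups cannot mutate the stored id
    collectorTerms.add(BytesRef.deepCopyOf(docTermOrds.lookupOrd(ord)));
}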
From source file:com.lucid.solr.sidecar.SidecarIndexReaderFactory.java
License:Apache License
DirectoryReader buildParallelReader(DirectoryReader main, SolrIndexSearcher source, boolean rebuild) {
    try {
        if (source == null) {
            throw new Exception("Source collection is missing.");
        }
        // create as a sibling path of the main index
        Directory d = main.directory();
        File primaryDir = null;
        if (d instanceof FSDirectory) {
            String path = ((FSDirectory) d).getDirectory().getPath();
            primaryDir = new File(path);
            sidecarIndex = new File(primaryDir.getParentFile(), sidecarIndexLocation);
        } else {
            String secondaryPath = System.getProperty("java.io.tmpdir") + File.separator + sidecarIndexLocation
                    + "-" + System.currentTimeMillis();
            sidecarIndex = new File(secondaryPath);
        }
        // create a new tmp dir for the secondary indexes
        File secondaryIndex = new File(sidecarIndex, System.currentTimeMillis() + "-index");
        if (rebuild) {
            safeDelete(sidecarIndex);
        }
        parallelFields.addAll(source.getFieldNames());
        parallelFields.remove("id");
        LOG.debug("building a new index");
        Directory dir = FSDirectory.open(secondaryIndex);
        if (IndexWriter.isLocked(dir)) {
            // try forcing unlock
            try {
                IndexWriter.unlock(dir);
            } catch (Exception e) {
                LOG.warn("Failed to unlock " + secondaryIndex);
            }
        }
        int[] mergeTargets;
        AtomicReader[] subReaders = SidecarIndexReader.getSequentialSubReaders(main);
        if (subReaders == null || subReaders.length == 0) {
            mergeTargets = new int[] { main.maxDoc() };
        } else {
            mergeTargets = new int[subReaders.length];
            for (int i = 0; i < subReaders.length; i++) {
                mergeTargets[i] = subReaders[i].maxDoc();
            }
        }
        Version ver = currentCore.getLatestSchema().getDefaultLuceneMatchVersion();
        IndexWriterConfig cfg = new IndexWriterConfig(ver, currentCore.getLatestSchema().getAnalyzer());
        //cfg.setInfoStream(System.err);
        cfg.setMergeScheduler(new SerialMergeScheduler());
        cfg.setMergePolicy(new SidecarMergePolicy(mergeTargets, false));
        IndexWriter iw = new IndexWriter(dir, cfg);
        LOG.info("processing " + main.maxDoc() + " docs / " + main.numDeletedDocs() + " dels in main index");
        int boostedDocs = 0;
        Bits live = MultiFields.getLiveDocs(main);

        int targetPos = 0;
        int nextTarget = mergeTargets[targetPos];
        BytesRef idRef = new BytesRef();
        for (int i = 0; i < main.maxDoc(); i++) {
            if (i == nextTarget) {
                iw.commit();
                nextTarget = nextTarget + mergeTargets[++targetPos];
            }
            if (live != null && !live.get(i)) {
                addDummy(iw); // this is required to preserve doc numbers.
                continue;
            } else {
                DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(docIdField);
                main.document(i, visitor);
                Document doc = visitor.getDocument();
                // get docId
                String id = doc.get(docIdField);
                if (id == null) {
                    LOG.debug("missing id, docNo=" + i);
                    addDummy(iw);
                    continue;
                } else {
                    // find the data, if any
                    doc = lookup(source, id, idRef, parallelFields);
                    if (doc == null) {
                        LOG.debug("missing boost data, docId=" + id);
                        addDummy(iw);
                        continue;
                    } else {
                        LOG.debug("adding boost data, docId=" + id + ", b=" + doc);
                        iw.addDocument(doc);
                        boostedDocs++;
                    }
                }
            }
        }
        iw.close();
        DirectoryReader other = DirectoryReader.open(dir);
        LOG.info("SidecarIndexReader with " + boostedDocs + " boosted documents.");
        SidecarIndexReader pr = createSidecarIndexReader(main, other, sourceCollection, secondaryIndex);
        return pr;
    } catch (Exception e) {
        LOG.warn("Unable to build parallel index: " + e.toString(), e);
        LOG.warn("Proceeding with single main index.");
        try {
            return new SidecarIndexReader(this, main, null, SidecarIndexReader.getSequentialSubReaders(main),
                    sourceCollection, null);
        } catch (Exception e1) {
            LOG.warn("Unexpected exception, returning single main index", e1);
            return main;
        }
    }
}
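The idRef scratch is allocated once, outside the per-document loop, and handed to lookup() so each id can be re-encoded into the same buffer on every iteration. A sketch of that reuse idiom with a hypothetical lookupId() helper (the real lookup() is not shown in this snippet; assumes Lucene 4.x, where BytesRef.copyChars is still available):

// hypothetical: resolve a string id to a doc number, refilling the caller's scratch buffer
static int lookupId(AtomicReader reader, String id, BytesRef scratch) throws IOException {
    scratch.copyChars(id); // grows scratch.bytes only when the id is longer than before
    Terms terms = reader.terms("id");
    if (terms == null) {
        return -1;
    }
    TermsEnum te = terms.iterator(null);
    if (!te.seekExact(scratch)) {
        return -1;
    }
    DocsEnum de = te.docs(reader.getLiveDocs(), null, DocsEnum.FLAG_NONE);
    int doc = de.nextDoc();
    return doc == DocIdSetIterator.NO_MORE_DOCS ? -1 : doc;
}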
From source file:com.lucure.core.codec.CompressingStoredFieldsReader.java
License:Apache License
/** Sole constructor. */
public CompressingStoredFieldsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
        IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
    this.compressionMode = compressionMode;
    final String segment = si.name;
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.getDocCount();
    ChecksumIndexInput indexStream = null;
    try {
        final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION);
        final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
        // Load the index into memory
        indexStream = d.openChecksumInput(indexStreamFN, context);
        final String codecNameIdx = formatName + CODEC_SFX_IDX;
        version = CodecUtil.checkHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT);
        assert CodecUtil.headerLength(codecNameIdx) == indexStream.getFilePointer();
        indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);

        long maxPointer = -1;
        if (version >= VERSION_CHECKSUM) {
            maxPointer = indexStream.readVLong();
            CodecUtil.checkFooter(indexStream);
        } else {
            CodecUtil.checkEOF(indexStream);
        }
        indexStream.close();
        indexStream = null;

        // Open the data file and read metadata
        fieldsStream = d.openInput(fieldsStreamFN, context);
        if (version >= VERSION_CHECKSUM) {
            if (maxPointer + CodecUtil.footerLength() != fieldsStream.length()) {
                throw new CorruptIndexException("Invalid fieldsStream maxPointer (file truncated?): maxPointer="
                        + maxPointer + ", length=" + fieldsStream.length());
            }
        } else {
            maxPointer = fieldsStream.length();
        }
        this.maxPointer = maxPointer;
        final String codecNameDat = formatName + CODEC_SFX_DAT;
        final int fieldsVersion = CodecUtil.checkHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT);
        if (version != fieldsVersion) {
            throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version
                    + " != " + fieldsVersion);
        }
        assert CodecUtil.headerLength(codecNameDat) == fieldsStream.getFilePointer();

        if (version >= VERSION_BIG_CHUNKS) {
            chunkSize = fieldsStream.readVInt();
        } else {
            chunkSize = -1;
        }
        packedIntsVersion = fieldsStream.readVInt();
        decompressor = compressionMode.newDecompressor();
        this.bytes = new BytesRef();

        if (version >= VERSION_CHECKSUM) {
            // NOTE: data file is too costly to verify checksum against all the bytes on open,
            // but for now we at least verify proper structure of the checksum footer: which looks
            // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
            // such as file truncation.
            CodecUtil.retrieveChecksum(fieldsStream);
        }

        success = true;
    } finally {
        if (!success) {
            IOUtils.closeWhileHandlingException(this, indexStream);
        }
    }
}
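The empty this.bytes created near the end is the reader's long-lived decompression scratch: Decompressor.decompress(in, originalLength, offset, length, ref) grows ref.bytes as needed and points ref.offset/ref.length at the requested window, so one zero-length BytesRef can serve every visitDocument call. A sketch of that contract, assuming a decompressor obtained from this codec's CompressionMode:

BytesRef scratch = new BytesRef(); // starts as a zero-length buffer
decompressor.decompress(in, originalLength, offset, length, scratch);
// scratch.bytes was grown as needed; bytes[scratch.offset..scratch.offset+scratch.length) is valid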
From source file:com.lucure.core.codec.CompressingStoredFieldsReader.java
License:Apache License
@Override
public void visitDocument(int docID, StoredFieldVisitor visitor) throws IOException {
    fieldsStream.seek(indexReader.getStartPointer(docID));

    final int docBase = fieldsStream.readVInt();
    final int chunkDocs = fieldsStream.readVInt();
    if (docID < docBase || docID >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
        throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs="
                + chunkDocs + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")");
    }

    final int numStoredFields, offset, length, totalLength;
    if (chunkDocs == 1) {
        numStoredFields = fieldsStream.readVInt();
        offset = 0;
        length = fieldsStream.readVInt();
        totalLength = length;
    } else {
        final int bitsPerStoredFields = fieldsStream.readVInt();
        if (bitsPerStoredFields == 0) {
            numStoredFields = fieldsStream.readVInt();
        } else if (bitsPerStoredFields > 31) {
            throw new CorruptIndexException(
                    "bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
        } else {
            final long filePointer = fieldsStream.getFilePointer();
            final PackedInts.Reader reader = PackedInts.getDirectReaderNoHeader(fieldsStream,
                    PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields);
            numStoredFields = (int) (reader.get(docID - docBase));
            fieldsStream.seek(filePointer
                    + PackedInts.Format.PACKED.byteCount(packedIntsVersion, chunkDocs, bitsPerStoredFields));
        }

        final int bitsPerLength = fieldsStream.readVInt();
        if (bitsPerLength == 0) {
            length = fieldsStream.readVInt();
            offset = (docID - docBase) * length;
            totalLength = chunkDocs * length;
        } else if (bitsPerLength > 31) { // guard the value actually being read here
            throw new CorruptIndexException(
                    "bitsPerLength=" + bitsPerLength + " (resource=" + fieldsStream + ")");
        } else {
            final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream,
                    PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
            int off = 0;
            for (int i = 0; i < docID - docBase; ++i) {
                off += it.next();
            }
            offset = off;
            length = (int) it.next();
            off += length;
            for (int i = docID - docBase + 1; i < chunkDocs; ++i) {
                off += it.next();
            }
            totalLength = off;
        }
    }

    if ((length == 0) != (numStoredFields == 0)) {
        throw new CorruptIndexException(
                "length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + fieldsStream + ")");
    }
    if (numStoredFields == 0) {
        // nothing to do
        return;
    }

    final DataInput documentInput;
    if (version >= VERSION_BIG_CHUNKS && totalLength >= 2 * chunkSize) {
        assert chunkSize > 0;
        assert offset < chunkSize;

        decompressor.decompress(fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes);
        documentInput = new DataInput() {

            int decompressed = bytes.length;

            void fillBuffer() throws IOException {
                assert decompressed <= length;
                if (decompressed == length) {
                    throw new EOFException();
                }
                final int toDecompress = Math.min(length - decompressed, chunkSize);
                decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes);
                decompressed += toDecompress;
            }

            @Override
            public byte readByte() throws IOException {
                if (bytes.length == 0) {
                    fillBuffer();
                }
                --bytes.length;
                return bytes.bytes[bytes.offset++];
            }

            @Override
            public void readBytes(byte[] b, int offset, int len) throws IOException {
                while (len > bytes.length) {
                    System.arraycopy(bytes.bytes, bytes.offset, b, offset, bytes.length);
                    len -= bytes.length;
                    offset += bytes.length;
                    fillBuffer();
                }
                System.arraycopy(bytes.bytes, bytes.offset, b, offset, len);
                bytes.offset += len;
                bytes.length -= len;
            }

        };
    } else {
        final BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? this.bytes : new BytesRef();
        decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
        assert bytes.length == length;
        documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
    }

    for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) {
        final long infoAndBits = documentInput.readVLong();
        final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
        final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);

        final int bits = (int) (infoAndBits & TYPE_MASK);
        assert bits <= NUMERIC_DOUBLE : "bits=" + Integer.toHexString(bits);

        // read the field's visibility restriction, if any
        FieldVisibility cv = RestrictedStoredFieldVisitor.EMPTY;
        boolean isRestricted = documentInput.readByte() == 1;
        if (isRestricted) {
            int cv_length = documentInput.readVInt();
            byte[] cv_bytes = new byte[cv_length];
            documentInput.readBytes(cv_bytes, 0, cv_length);
            cv = new FieldVisibility(cv_bytes);
        }

        RestrictedStoredFieldVisitor restrictedStoredFieldVisitor = DelegatingRestrictedFieldVisitor.wrap(visitor);
        if (evaluate(cv)) {
            switch (restrictedStoredFieldVisitor.needsField(fieldInfo, cv)) {
            case YES:
                readField(documentInput, restrictedStoredFieldVisitor, fieldInfo, bits, cv);
                break;
            case NO:
                skipField(documentInput, bits, cv);
                break;
            case STOP:
                return;
            }
        } else {
            skipField(documentInput, bits, cv);
        }
    }
}
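On the small-document path the decompressed window is wrapped in a ByteArrayDataInput without copying; documents above BUFFER_REUSE_THRESHOLD get a throwaway BytesRef so the reader-level scratch never balloons. The same no-copy wrapping works for any BytesRef you want to read as a stream; a one-line sketch:

// expose a BytesRef's valid window as a DataInput; offset and length matter, not bytes.bytes.length
DataInput docIn = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);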
From source file:com.rocana.lucene.codec.v1.RocanaBlockTreeTermsReader.java
License:Apache License
private static BytesRef readBytesRef(IndexInput in) throws IOException {
    BytesRef bytes = new BytesRef();
    bytes.length = in.readVInt();
    bytes.bytes = new byte[bytes.length];
    in.readBytes(bytes.bytes, 0, bytes.length);
    return bytes;
}
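The method fills a fresh BytesRef field by field: set the length, allocate the backing array, read into it. On Lucene 5 and later the same job is usually done with BytesRefBuilder, which manages buffer growth; a sketch under that assumption:

private static BytesRef readBytesRef(IndexInput in) throws IOException {
    int len = in.readVInt();
    BytesRefBuilder builder = new BytesRefBuilder();
    builder.grow(len);      // ensure capacity for len bytes
    builder.setLength(len); // mark the region we are about to fill as valid
    in.readBytes(builder.bytes(), 0, len);
    return builder.get();
}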
From source file:com.rondhuit.w2v.lucene.LuceneIndexCorpus.java
License:Apache License
@Override
public void learnVocab() throws IOException {
    super.learnVocab();

    final String field = ((LuceneIndexConfig) config).getField();
    final Terms terms = MultiFields.getTerms(reader, field);
    final BytesRef maxTerm = terms.getMax();
    final BytesRef minTerm = terms.getMin();
    Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    topDocs = searcher.search(q, Integer.MAX_VALUE);

    TermsEnum termsEnum = null;
    termsEnum = terms.iterator(termsEnum);
    termsEnum.seekCeil(new BytesRef());
    BytesRef term = termsEnum.term();
    while (term != null) {
        int p = addWordToVocab(term.utf8ToString());
        vocab[p].setCn((int) termsEnum.totalTermFreq());
        term = termsEnum.next();
    }
}
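termsEnum.seekCeil(new BytesRef()) positions the enum at the smallest term, since every term sorts at or after the empty byte string; term() and next() then walk the whole field. The more common shape folds the seek away entirely; a sketch of the equivalent loop:

TermsEnum termsEnum = terms.iterator(null);
BytesRef term;
while ((term = termsEnum.next()) != null) { // next() starts at the first term on a fresh enum
    int p = addWordToVocab(term.utf8ToString());
    vocab[p].setCn((int) termsEnum.totalTermFreq());
}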
From source file:com.shaie.PhraseVsSpanQuery.java
License:Apache License
@SuppressWarnings("resource") public static void main(String[] args) throws Exception { final Directory dir = new RAMDirectory(); final IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer()); final IndexWriter writer = new IndexWriter(dir, conf); final Document doc = new Document(); doc.add(new TextField("f", new TokenStream() { final PositionIncrementAttribute pos = addAttribute(PositionIncrementAttribute.class); final CharTermAttribute term = addAttribute(CharTermAttribute.class); boolean first = true, done = false; @Override// ww w .j av a2 s . co m public boolean incrementToken() throws IOException { if (done) { return false; } if (first) { term.setEmpty().append("a"); pos.setPositionIncrement(1); first = false; } else { term.setEmpty().append("b"); pos.setPositionIncrement(0); done = true; } return true; } })); writer.addDocument(doc); writer.close(); final DirectoryReader reader = DirectoryReader.open(dir); final IndexSearcher searcher = new IndexSearcher(reader); final LeafReader ar = reader.leaves().get(0).reader(); final TermsEnum te = ar.terms("f").iterator(); BytesRef scratch = new BytesRef(); while ((scratch = te.next()) != null) { System.out.println(scratch.utf8ToString()); final PostingsEnum dape = ar.postings(new Term("f", scratch.utf8ToString())); System.out.println(" doc=" + dape.nextDoc() + ", pos=" + dape.nextPosition()); } System.out.println(); // try a phrase query with a slop final PhraseQuery pqNoSlop = buildPhraseQuery(0); System.out.println("searching for \"a b\"; num results = " + searcher.search(pqNoSlop, 10).totalHits); final PhraseQuery pqSlop1 = buildPhraseQuery(1); System.out.println("searching for \"a b\"~1; num results = " + searcher.search(pqSlop1, 10).totalHits); final PhraseQuery pqSlop3 = buildPhraseQuery(3); System.out.println("searching for \"a b\"~3; num results = " + searcher.search(pqSlop3, 10).totalHits); final SpanNearQuery snqUnOrdered = new SpanNearQuery( new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1, false); System.out.println("searching for SpanNearUnordered('a', 'b'), slop=1; num results = " + searcher.search(snqUnOrdered, 10).totalHits); final SpanNearQuery snqOrdered = new SpanNearQuery( new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1, true); System.out.println("searching for SpanNearOrdered('a', 'b'), slop=1; num results = " + searcher.search(snqOrdered, 10).totalHits); reader.close(); }
From source file:com.sindicetech.siren.search.node.NodeConstantScoreAutoRewrite.java
License:Open Source License
@Override
public Query rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    // Disabled cutoffs
    final int docCountCutoff = Integer.MAX_VALUE;
    final int termCountLimit = Integer.MAX_VALUE;

    final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
    this.collectTerms(reader, query, col);
    final int size = col.pendingTerms.size();
    if (col.hasCutOff) {
        return MultiNodeTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
    } else if (size == 0) {
        return this.getTopLevelQuery(query);
    } else {
        final NodeBooleanQuery bq = this.getTopLevelQuery(query);
        final BytesRefHash pendingTerms = col.pendingTerms;
        final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            // docFreq is not used for constant score here, we pass 1
            // to explicitly set a fake value, so it's not calculated
            this.addClause(bq, new Term(query.field, pendingTerms.get(pos, new BytesRef())), 1, 1.0f,
                    col.array.termState[pos]);
        }
        // Strip scores
        final NodeQuery result = new NodeConstantScoreQuery(bq);
        result.setBoost(query.getBoost());
        return result;
    }
}
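pendingTerms.get(pos, ref) does not copy bytes: it repoints ref at the hash's internal byte pool and returns it, which is why each clause wraps its term in a fresh BytesRef rather than sharing one scratch. A small self-contained illustration of the aliasing, with hypothetical contents:

BytesRefHash hash = new BytesRefHash();
int idA = hash.add(new BytesRef("alpha")); // first new entry gets id 0
int idB = hash.add(new BytesRef("beta"));  // second gets id 1
BytesRef scratch = new BytesRef();
BytesRef first = hash.get(idA, scratch);   // returns scratch, pointed at "alpha"
BytesRef second = hash.get(idB, scratch);  // same object, repointed at "beta"
System.out.println(first == second);       // true: one shared object
System.out.println(first.utf8ToString());  // "beta", so reusing one scratch would corrupt earlier Terms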
From source file:com.sindicetech.siren.search.node.NodeScoringRewrite.java
License:Open Source License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final Q result = this.getTopLevelQuery(query);
    final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
    this.collectTerms(reader, query, col);

    final int size = col.terms.size();
    if (size > 0) {
        final int sort[] = col.terms.sort(col.termsEnum.getComparator());
        final float[] boost = col.array.boost;
        final TermContext[] termStates = col.array.termState;
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
            assert reader.docFreq(term) == termStates[pos].docFreq();
            this.addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos],
                    termStates[pos]);
        }
    }
    return result;
}