Example usage for org.apache.lucene.util IntsRef IntsRef

List of usage examples for org.apache.lucene.util IntsRef IntsRef

Introduction

On this page you can find the example usage for org.apache.lucene.util IntsRef IntsRef.

Prototype

public IntsRef() 

Source Link

Document

Create an IntsRef with #EMPTY_INTS

Usage

From source file:org.elasticsearch.index.fielddata.plain.FSTBytesIndexFieldData.java

License:Apache License

/**
 * Eagerly loads the field's terms into an FST keyed by term bytes, with the term
 * ordinal as output, and records per-document ordinals alongside it.
 *
 * @param context the segment to load field data for
 * @return the loaded atomic field data (never null; empty when the field has no terms)
 * @throws Exception if reading the terms or building the FST fails
 */
@Override
public FSTBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception {
    AtomicReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    // TODO: use a real estimator so memory is accounted before loading, not after.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker());
    if (terms == null) {
        FSTBytesAtomicFieldData empty = FSTBytesAtomicFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, empty.getMemorySizeInBytes());
        return empty;
    }
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    org.apache.lucene.util.fst.Builder<Long> fstBuilder =
            new org.apache.lucene.util.fst.Builder<Long>(INPUT_TYPE.BYTE1, outputs);
    final IntsRef scratchInts = new IntsRef();

    // terms.size() is only trusted when no regex/frequency filtering applies; -1 = unknown.
    final long numTerms = (regex == null && frequency == null) ? terms.size() : -1;
    final float acceptableTransientOverheadRatio = fieldDataType.getSettings().getAsFloat(
            "acceptable_transient_overhead_ratio", OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO);
    OrdinalsBuilder ordinalsBuilder = new OrdinalsBuilder(numTerms, reader.maxDoc(), acceptableTransientOverheadRatio);
    FSTBytesAtomicFieldData result = null;
    boolean loaded = false;
    try {
        // Ord 0 is never stored in the FST: the FST cannot hold an empty string twice,
        // i.e. the merge would fail for a long output.
        TermsEnum termsEnum = filter(terms, reader);
        DocsEnum docsEnum = null;
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            final long termOrd = ordinalsBuilder.nextOrdinal();
            assert termOrd > 0;
            fstBuilder.add(Util.toIntsRef(term, scratchInts), termOrd);
            docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
            for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                ordinalsBuilder.addDoc(docId);
            }
        }

        FST<Long> fst = fstBuilder.finish();
        Ordinals ordinals = ordinalsBuilder.build(fieldDataType.getSettings());

        result = new FSTBytesAtomicFieldData(fst, ordinals);
        loaded = true;
        return result;
    } finally {
        if (loaded) {
            // Only account memory for a successfully built instance.
            estimator.afterLoad(null, result.getMemorySizeInBytes());
        }
        ordinalsBuilder.close();
    }
}

From source file:org.meresco.lucene.search.MerescoTaxonomyFacetCounts.java

License:Open Source License

/**
 * Accumulates facet counts for every matching document: for each hit, reads its
 * facet ordinals through every configured ordinals reader and increments the
 * corresponding slot in {@code values}, then rolls counts up the taxonomy.
 *
 * @param matchingDocs per-segment matching documents to count
 * @throws IOException if reading ordinals fails
 */
private final void count(List<MatchingDocs> matchingDocs) throws IOException {
    // Reusable buffer holding the ordinals of one document at a time.
    IntsRef scratch = new IntsRef();
    int readerCount = this.ordinalsReaders.size();
    OrdinalsReader.OrdinalsSegmentReader[] segmentReaders =
            new OrdinalsReader.OrdinalsSegmentReader[readerCount];
    for (MatchingDocs hits : matchingDocs) {
        // Re-bind every ordinals reader to the current segment before iterating its docs.
        for (int i = 0; i < readerCount; i++) {
            segmentReaders[i] = this.ordinalsReaders.get(i).getReader(hits.context);
        }
        DocIdSetIterator docs = hits.bits.iterator();
        for (int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.nextDoc()) {
            for (OrdinalsReader.OrdinalsSegmentReader ords : segmentReaders) {
                ords.get(doc, scratch);
                // Bump the count for every facet ordinal present on this document.
                for (int i = 0; i < scratch.length; i++) {
                    values[scratch.ints[scratch.offset + i]]++;
                }
            }
        }
    }

    rollup();
}

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Builds an FST mapping each affix string (encoded as UTF-32 input) to the list
 * of its entry indices.
 *
 * @param affixes sorted map from affix string to its entry characters; sorted
 *                iteration order is what the FST builder requires
 * @return the finished affix FST
 * @throws IOException if the FST builder fails
 */
private FST<IntsRef> affixFST(TreeMap<String, List<Character>> affixes) throws IOException {
    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
    Builder<IntsRef> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);

    IntsRef scratchInts = new IntsRef();
    for (Map.Entry<String, List<Character>> affix : affixes.entrySet()) {
        Util.toUTF32(affix.getKey(), scratchInts);
        List<Character> entryChars = affix.getValue();
        // Pre-size the output to exactly the number of entries, then fill it.
        IntsRef output = new IntsRef(entryChars.size());
        for (Character entryChar : entryChars) {
            output.ints[output.length++] = entryChar;
        }
        fstBuilder.add(scratchInts, output);
    }
    return fstBuilder.finish();
}

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Parses {@code num} conversion lines from the affix file and builds an FST
 * mapping each input pattern (UTF-16 encoded) to its replacement text.
 *
 * @param reader reader positioned at the first conversion line
 * @param num number of conversion entries announced by the preceding header line
 * @return FST from pattern to replacement
 * @throws IOException if reading fails
 * @throws ParseException if the file ends early or a line is malformed
 * @throws IllegalStateException if the same pattern is mapped twice
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
    // TreeMap: the FST builder requires its inputs in sorted order.
    Map<String, String> mappings = new TreeMap<>();

    for (int i = 0; i < num; i++) {
        String line = reader.readLine();
        if (line == null) {
            // The header promised more entries than the file contains; fail with a
            // diagnosable error instead of an NPE from line.split below.
            throw new ParseException("unexpected end of file while reading conversions",
                    reader.getLineNumber());
        }
        String parts[] = line.split("\\s+");
        if (parts.length != 3) {
            throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
        }
        if (mappings.put(parts[1], parts[2]) != null) {
            throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
        }
    }

    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRef scratchInts = new IntsRef();
    for (Map.Entry<String, String> entry : mappings.entrySet()) {
        Util.toUTF16(entry.getKey(), scratchInts);
        builder.add(scratchInts, new CharsRef(entry.getValue()));
    }

    return builder.finish();
}

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Reads the dictionary file through the provided InputStreams, building up the words map.
 *
 * <p>Two passes: entries are first unescaped/cleaned and spilled to a temp file, which is
 * then sorted offline by the word part (the text before the flag separator) so that all
 * flag sets belonging to the same word become adjacent and can be merged into a single
 * entry of the {@code words} FST.
 *
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @param words FST builder that receives one (word, flag-set ordinals) entry per word
 * @throws IOException Can be thrown while reading from the file
 */
private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
        throws IOException {
    BytesRef flagsScratch = new BytesRef(); // reused buffer for the encoded flag set
    IntsRef scratchInts = new IntsRef();    // reused buffer for UTF-32 converted words

    StringBuilder sb = new StringBuilder();

    // Pass 1: normalize every dictionary line and append it to an unsorted temp file.
    File unsorted = File.createTempFile("unsorted", "dat", tempDir);
    ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
    boolean success = false;
    try {
        for (InputStream dictionary : dictionaries) {
            BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
            String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

            while ((line = lines.readLine()) != null) {
                line = unescapeEntry(line);
                if (needsInputCleaning) {
                    // Clean only the word part; the flag part (from FLAG_SEPARATOR on) is kept verbatim.
                    int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
                    if (flagSep == -1) {
                        CharSequence cleansed = cleanInput(line, sb);
                        writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                    } else {
                        String text = line.substring(0, flagSep);
                        CharSequence cleansed = cleanInput(text, sb);
                        if (cleansed != sb) {
                            // cleanInput may return its input rather than filling sb; copy it in.
                            sb.setLength(0);
                            sb.append(cleansed);
                        }
                        sb.append(line.substring(flagSep));
                        writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                    }
                } else {
                    writer.write(line.getBytes(StandardCharsets.UTF_8));
                }
            }
        }
        success = true;
    } finally {
        // On failure, close without masking the original exception.
        if (success) {
            IOUtils.close(writer);
        } else {
            IOUtils.closeWhileHandlingException(writer);
        }
    }
    File sorted = File.createTempFile("sorted", "dat", tempDir);

    // Sort rows by the word part only (bytes before the last FLAG_SEPARATOR) so all flag
    // variants of a word become adjacent; ties are broken on the whole row to keep the
    // ordering total and deterministic.
    OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
        BytesRef scratch1 = new BytesRef();
        BytesRef scratch2 = new BytesRef();

        @Override
        public int compare(BytesRef o1, BytesRef o2) {
            scratch1.bytes = o1.bytes;
            scratch1.offset = o1.offset;
            scratch1.length = o1.length;

            // Truncate scratch1 to the word part (everything before the last FLAG_SEPARATOR).
            for (int i = scratch1.length - 1; i >= 0; i--) {
                if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR) {
                    scratch1.length = i;
                    break;
                }
            }

            scratch2.bytes = o2.bytes;
            scratch2.offset = o2.offset;
            scratch2.length = o2.length;

            for (int i = scratch2.length - 1; i >= 0; i--) {
                if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR) {
                    scratch2.length = i;
                    break;
                }
            }

            int cmp = scratch1.compareTo(scratch2);
            if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
            } else {
                return cmp;
            }
        }
    });
    sorter.sort(unsorted, sorted);
    unsorted.delete();

    // Pass 2: read the sorted rows back and merge consecutive rows with the same word.
    ByteSequencesReader reader = new ByteSequencesReader(sorted);
    BytesRef scratchLine = new BytesRef();

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently

    String currentEntry = null;
    IntsRef currentOrds = new IntsRef();

    String line;
    while (reader.read(scratchLine)) {
        line = scratchLine.utf8ToString();
        String entry;
        char wordForm[];

        int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
        if (flagSep == -1) {
            wordForm = NOFLAGS;
            entry = line;
        } else {
            // note, there can be comments (morph description) after a flag.
            // we should really look for any whitespace: currently just tab and space
            int end = line.indexOf('\t', flagSep);
            if (end == -1)
                end = line.length();
            int end2 = line.indexOf(' ', flagSep);
            if (end2 == -1)
                end2 = line.length();
            end = Math.min(end, end2);

            String flagPart = line.substring(flagSep + 1, end);
            if (aliasCount > 0) {
                // Numeric flag part: an index into the alias table declared in the affix file.
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }

            wordForm = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(wordForm);
            entry = line.substring(0, flagSep);
        }

        // cmp > 0: a new word starts here; cmp == 0: another flag set for the current word.
        // cmp < 0 would mean the offline sort above failed.
        int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
        if (cmp < 0) {
            throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
        } else {
            encodeFlags(flagsScratch, wordForm);
            int ord = flagLookup.add(flagsScratch);
            if (ord < 0) {
                // already exists in our hash; add() signals that with -(ord+1)
                ord = (-ord) - 1;
            }
            // finalize current entry, and switch "current" if necessary
            if (cmp > 0 && currentEntry != null) {
                Util.toUTF32(currentEntry, scratchInts);
                words.add(scratchInts, currentOrds);
            }
            // swap current
            if (cmp > 0 || currentEntry == null) {
                currentEntry = entry;
                // must be this way: NOTE(review) — presumably words.add retains a reference
                // to the IntsRef, so it cannot be reused across entries; confirm against
                // the FST Builder.add contract.
                currentOrds = new IntsRef();
            }
            currentOrds.grow(currentOrds.length + 1);
            currentOrds.ints[currentOrds.length++] = ord;
        }
    }

    // finalize last entry
    // NOTE(review): assumes at least one row was read — currentEntry stays null for empty
    // input and Util.toUTF32 would fail; confirm callers guarantee non-empty dictionaries.
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts, currentOrds);

    reader.close();
    sorted.delete();
}