Example usage for org.apache.lucene.util IntsRefBuilder setIntAt

List of usage examples for org.apache.lucene.util IntsRefBuilder setIntAt

Introduction

On this page you can find example usage for org.apache.lucene.util IntsRefBuilder setIntAt.

Prototype

public void setIntAt(int offset, int b) 

Source Link

Document

Set an int.

Usage

From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.dict.UserDictionary.java

License:Apache License

/**
 * Builds the user dictionary from already-parsed feature entries.
 *
 * <p>Each entry is read as follows: {@code [0]} is the surface token used as the FST key,
 * {@code [1]} is its segmentation as space-separated segments, {@code [2]} holds the
 * space-separated readings (one per segment) and {@code [3]} is the part-of-speech string that
 * is attached to every segment of the entry.
 *
 * <p>Every segment receives its own consecutive word id, starting at
 * {@code CUSTOM_DICTIONARY_WORD_ID_OFFSET}. The FST maps each surface token to the ordinal of
 * the entry, which is the index of the entry's row in {@code segmentations}.
 *
 * @param featureEntries entries to index; this list is sorted in place by the first column
 * @throws IOException declared for the FST construction calls (presumably raised while adding
 *     to or finishing the builder)
 * @throws RuntimeException if an entry's segment count differs from its reading count
 */
private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    // The FST builder expects its inputs in sorted order, so order the entries by surface token.
    // String.compareTo compares UTF-16 code units, matching the BYTE2 input type used below.
    Collections.sort(featureEntries, new Comparator<String[]>() {
        @Override
        public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
        }
    });

    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());

    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;

    for (String[] values : featureEntries) {
        // Collapse runs of spaces to a single space before splitting into segments/readings.
        String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
        String[] readings = values[2].replaceAll("  *", " ").split(" ");
        String pos = values[3];

        if (segmentation.length != readings.length) {
            throw new RuntimeException("Illegal user dictionary entry " + values[0]
                    + " - the number of segmentations (" + segmentation.length + ")"
                    + " does not match the number of readings (" + readings.length + ")");
        }

        int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
        wordIdAndLength[0] = wordId;
        for (int i = 0; i < segmentation.length; i++) {
            wordIdAndLength[i + 1] = segmentation[i].length();
            // One data row per segment; its index relative to the offset is the segment's word id.
            data.add(readings[i] + INTERNAL_SEPARATOR + pos);
            wordId++;
        }

        // add mapping to FST: the key is the token's UTF-16 code units, the output is the ordinal
        String token = values[0];
        int tokenLength = token.length();
        scratch.grow(tokenLength);
        scratch.setLength(tokenLength);
        for (int i = 0; i < tokenLength; i++) {
            scratch.setIntAt(i, token.charAt(i));
        }
        fstBuilder.add(scratch.get(), ord);
        segmentations.add(wordIdAndLength);
        ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
}