Example usage for org.apache.lucene.util IntsRefBuilder grow

List of usage examples for org.apache.lucene.util IntsRefBuilder grow

Introduction

In this page you can find the example usage for org.apache.lucene.util IntsRefBuilder grow.

Prototype

public void grow(int newLength) 

Source Link

Document

Used to grow the reference array.

Usage

From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.dict.UserDictionary.java

License:Apache License

private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    Collections.sort(featureEntries, new Comparator<String[]>() {
        @Override/*  w  ww . j a  v a 2s .  c o  m*/
        public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
        }
    });

    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());

    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;

    for (String[] values : featureEntries) {
        String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
        String[] readings = values[2].replaceAll("  *", " ").split(" ");
        String pos = values[3];

        if (segmentation.length != readings.length) {
            throw new RuntimeException("Illegal user dictionary entry " + values[0]
                    + " - the number of segmentations (" + segmentation.length + ")"
                    + " does not the match number of readings (" + readings.length + ")");
        }

        int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
        wordIdAndLength[0] = wordId;
        for (int i = 0; i < segmentation.length; i++) {
            wordIdAndLength[i + 1] = segmentation[i].length();
            data.add(readings[i] + INTERNAL_SEPARATOR + pos);
            wordId++;
        }
        // add mapping to FST
        String token = values[0];
        scratch.grow(token.length());
        scratch.setLength(token.length());
        for (int i = 0; i < token.length(); i++) {
            scratch.setIntAt(i, (int) token.charAt(i));
        }
        fstBuilder.add(scratch.get(), ord);
        segmentations.add(wordIdAndLength);
        ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
}