List of usage examples for the constructor org.apache.lucene.util.BytesRef.BytesRef
public BytesRef(CharSequence text)
From source file: com.gitblit.tickets.TicketIndexer.java
License: Apache License
private void toDocField(Document doc, Lucene lucene, String value) { if (StringUtils.isEmpty(value)) { return;// w w w .ja v a2 s.co m } doc.add(new org.apache.lucene.document.Field(lucene.name(), value, TextField.TYPE_STORED)); doc.add(new SortedDocValuesField(lucene.name(), new BytesRef(value))); }
From source file: com.github.flaxsearch.testutil.Fixtures.java
License: Apache License
private static void populateIndex(Directory directory) { try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig())) { {/*from ww w.j a v a 2s.c o m*/ Document doc = new Document(); doc.add(new TextField("field2", "here is some text", Field.Store.YES)); doc.add(new StringField("field1", "value1", Field.Store.YES)); doc.add(new IntPoint("point", 2, 4)); doc.add(new IntPoint("point", 0, 1)); doc.add(new IntPoint("point", 2, 1)); doc.add(new IntPoint("point", 14, 4)); writer.addDocument(doc); // more than one segment writer.commit(); } { Document doc = new Document(); doc.add(new StringField("field1", "value2", Field.Store.YES)); doc.add(new BinaryDocValuesField("field1", new BytesRef("some bytes"))); doc.add(new TextField("field3", "this is some more text in a different field value1 value11 value12 value21", Field.Store.YES)); writer.addDocument(doc); } } catch (IOException e) { throw new RuntimeException("We're a RAMDirectory, this should never happen!"); } }
From source file: com.github.flaxsearch.util.BytesRefUtils.java
License: Apache License
private static Function<String, BytesRef> getDecoder(String type) { switch (type.toLowerCase(Locale.ROOT)) { case "base64": return s -> new BytesRef(Base64.getUrlDecoder().decode(s.getBytes(Charset.defaultCharset()))); case "utf8": return BytesRef::new; case "int": return s -> { BytesRefBuilder builder = new BytesRefBuilder(); LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(s), 0, builder); return builder.get(); };//from ww w . j a v a2s.c o m case "long": return s -> { BytesRefBuilder builder = new BytesRefBuilder(); LegacyNumericUtils.longToPrefixCoded(Long.parseLong(s), 0, builder); return builder.get(); }; case "float": return s -> { BytesRefBuilder builder = new BytesRefBuilder(); LegacyNumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0, builder); return builder.get(); }; case "double": return s -> { BytesRefBuilder builder = new BytesRefBuilder(); LegacyNumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0, builder); return builder.get(); }; default: throw new IllegalArgumentException("Unknown decoder type: " + type); } }
From source file: com.github.flaxsearch.util.ReaderManager.java
License: Apache License
/**
 * Positions a {@link TermsEnum} on the exact term within the given field and
 * segment, translating "not found" into HTTP 404 responses.
 *
 * @param segment segment ordinal (null semantics delegated to getFields)
 * @param field   field whose terms dictionary is consulted
 * @param term    exact term text to seek to (must not be null)
 * @return a TermsEnum positioned on the term, ready for postings access
 * @throws WebApplicationException 404 when the field or the term is absent
 * @throws IOException on index access failure
 */
default TermsEnum findTermPostings(Integer segment, String field, String term) throws IOException {
    Fields fields = getFields(segment);
    Terms terms = fields.terms(field);
    if (terms == null) {
        throw new WebApplicationException(String.format("No field %s", field), Response.Status.NOT_FOUND);
    }
    assert term != null;
    TermsEnum termsEnum = terms.iterator();
    if (!termsEnum.seekExact(new BytesRef(term))) {
        throw new WebApplicationException(String.format("No term %s on field %s", term, field),
                Response.Status.NOT_FOUND);
    }
    return termsEnum;
}
From source file: com.github.tteofili.looseen.MinHashClassifier.java
License: Apache License
List<ClassificationResult<BytesRef>> buildListFromTopDocs(IndexSearcher searcher, String categoryFieldName, TopDocs topDocs, int k) throws IOException { Map<BytesRef, Integer> classCounts = new HashMap<>(); Map<BytesRef, Double> classBoosts = new HashMap<>(); // this is a boost based on class ranking positions in topDocs float maxScore = topDocs.getMaxScore(); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { IndexableField storableField = searcher.doc(scoreDoc.doc).getField(categoryFieldName); if (storableField != null) { BytesRef cl = new BytesRef(storableField.stringValue()); //update count Integer count = classCounts.get(cl); if (count != null) { classCounts.put(cl, count + 1); } else { classCounts.put(cl, 1);//w w w . j a v a2 s . c o m } //update boost, the boost is based on the best score Double totalBoost = classBoosts.get(cl); double singleBoost = scoreDoc.score / maxScore; if (totalBoost != null) { classBoosts.put(cl, totalBoost + singleBoost); } else { classBoosts.put(cl, singleBoost); } } } List<ClassificationResult<BytesRef>> returnList = new ArrayList<>(); List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>(); int sumdoc = 0; for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) { Integer count = entry.getValue(); Double normBoost = classBoosts.get(entry.getKey()) / count; //the boost is normalized to be 0<b<1 temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k)); sumdoc += count; } //correction if (sumdoc < k) { for (ClassificationResult<BytesRef> cr : temporaryList) { returnList.add( new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc)); } } else { returnList = temporaryList; } return returnList; }
From source file: com.github.tteofili.looseen.QueryingClassifier.java
License: Apache License
/**
 * Runs each per-class query against the index and assigns the class whose
 * query scores highest — by total hit count when {@code useCounts} is set,
 * otherwise by the best single-hit score.
 *
 * @param text unused here; classification is driven by the stored queries
 * @return the winning class and its score, or null if there are no queries
 * @throws IOException on search failure
 */
@Override
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
    ClassificationResult<BytesRef> best = null;
    for (Map.Entry<String, Query> entry : queriesPerClass.entrySet()) {
        TopDocs search = indexSearcher.search(entry.getValue(), 1);
        float score = useCounts ? search.totalHits : search.getMaxScore();
        if (best == null || score > best.getScore()) {
            best = new ClassificationResult<>(new BytesRef(entry.getKey()), score);
        }
    }
    return best;
}
From source file: com.github.tteofili.looseen.Test20NewsgroupsClassification.java
License: Apache License
/**
 * Indexes every newsgroup post found under {@code indexDir}, where each
 * subdirectory is a group and each file within it a post. Each document gets
 * the category (stored + sorted doc values), subject, and body, and a final
 * commit is issued regardless of how many posts were found.
 *
 * @throws IOException on parse or index-write failure
 */
void buildIndex(File indexDir, IndexWriter indexWriter) throws IOException {
    File[] groups = indexDir.listFiles();
    if (groups != null) {
        for (File group : groups) {
            String groupName = group.getName();
            File[] posts = group.listFiles();
            if (posts == null) {
                continue;
            }
            for (File postFile : posts) {
                NewsPost post = parse(postFile, groupName, postFile.getName());
                Document d = new Document();
                d.add(new StringField(CATEGORY_FIELD, post.getGroup(), Field.Store.YES));
                // Category is added again as doc values to support sorting/grouping.
                d.add(new SortedDocValuesField(CATEGORY_FIELD, new BytesRef(post.getGroup())));
                d.add(new TextField(SUBJECT_FIELD, post.getSubject(), Field.Store.YES));
                d.add(new TextField(BODY_FIELD, post.getBody(), Field.Store.YES));
                indexWriter.addDocument(d);
            }
        }
    }
    indexWriter.commit();
}
From source file: com.github.tteofili.looseen.TestWikipediaClassification.java
License: Apache License
/**
 * Streams a Wikipedia XML dump into the index, one document per &lt;page&gt;
 * element, extracting title, text, and the categories matched by the
 * class-level {@code pattern}. Commits every 100000 pages and once at the end,
 * printing progress to stdout.
 *
 * @param dump        the .xml dump file (any other extension is rejected)
 * @param indexWriter destination index writer
 * @throws Exception on XML or index failures
 */
private static void importWikipedia(File dump, IndexWriter indexWriter) throws Exception {
    long start = System.currentTimeMillis();
    int count = 0;
    System.out.format("Importing %s...%n", dump);
    // title/text/cats carry state between START_ELEMENT and the enclosing
    // page's END_ELEMENT; only cats is reset per page (title/text are simply
    // overwritten by the next page's elements).
    String title = null;
    String text = null;
    Set<String> cats = new HashSet<>();
    XMLInputFactory factory = XMLInputFactory.newInstance();
    StreamSource source;
    if (dump.getName().endsWith(".xml")) {
        source = new StreamSource(dump);
    } else {
        throw new RuntimeException("can index only wikipedia XML files");
    }
    XMLStreamReader reader = factory.createXMLStreamReader(source);
    while (reader.hasNext()) {
        // Safety valve against pathological dumps.
        if (count == Integer.MAX_VALUE) {
            break;
        }
        switch (reader.next()) {
        case XMLStreamConstants.START_ELEMENT:
            if ("title".equals(reader.getLocalName())) {
                title = reader.getElementText();
            } else if (TEXT_FIELD.equals(reader.getLocalName())) {
                text = reader.getElementText();
                // Scan the article text for category markup; each match may
                // contain several pipe-separated category names.
                Matcher matcher = pattern.matcher(text);
                int pos = 0;
                while (matcher.find(pos)) {
                    String group = matcher.group(1);
                    // Strip "|<space>" and "|*" decorations before splitting on '|'.
                    String catName = group.replaceAll("\\|\\s", "").replaceAll("\\|\\*", "");
                    Collections.addAll(cats, catName.split("\\|"));
                    pos = matcher.end();
                }
            }
            break;
        case XMLStreamConstants.END_ELEMENT:
            // A closing </page> flushes the accumulated state into one document.
            if ("page".equals(reader.getLocalName())) {
                Document page = new Document();
                if (title != null) {
                    page.add(new TextField(TITLE_FIELD, title, StoredField.Store.YES));
                }
                if (text != null) {
                    page.add(new TextField(TEXT_FIELD, text, StoredField.Store.YES));
                }
                for (String cat : cats) {
                    // Stored for retrieval plus doc values for faceting/sorting.
                    page.add(new StringField(CATEGORY_FIELD, cat, Field.Store.YES));
                    page.add(new SortedSetDocValuesField(CATEGORY_FIELD, new BytesRef(cat)));
                }
                indexWriter.addDocument(page);
                cats.clear();
                count++;
                // Periodic commit keeps memory bounded on large dumps.
                if (count % 100000 == 0) {
                    indexWriter.commit();
                    System.out.format("Committed %d pages%n", count);
                }
            }
            break;
        }
    }
    indexWriter.commit();
    long millis = System.currentTimeMillis() - start;
    // NOTE(review): if the dump contains zero pages, (double) millis / count
    // divides by zero (prints NaN/Infinity) — harmless but worth confirming.
    System.out.format("Imported %d pages in %d seconds (%.2fms/page)%n", count, millis / 1000,
            (double) millis / count);
}
From source file: com.hrhih.index.suggest.Input.java
License: Apache License
/**
 * Convenience constructor: encodes {@code term} to a {@link BytesRef}
 * (UTF-8, per the BytesRef(CharSequence) contract) and delegates.
 *
 * @param term    suggestion text
 * @param v       weight of the suggestion
 * @param payload opaque payload stored alongside the entry
 */
public Input(String term, long v, BytesRef payload) {
    this(new BytesRef(term), v, payload);
}
From source file: com.hrhih.index.suggest.Input.java
License: Apache License
/**
 * Convenience constructor for a contexts-only entry: encodes {@code term}
 * to a {@link BytesRef} and delegates with no payload.
 *
 * @param term     suggestion text
 * @param v        weight of the suggestion
 * @param contexts context set associated with the entry
 */
public Input(String term, long v, Set<BytesRef> contexts) {
    // Delegates with (payload=null, hasPayloads=false, contexts, hasContexts=true) —
    // NOTE(review): argument meaning inferred from the values; confirm against
    // the primary constructor, which is not visible here.
    this(new BytesRef(term), v, null, false, contexts, true);
}