List of usage examples for the org.apache.lucene.index Term.field() method
Signature: String field() — returns the name of the field this term occurs in.
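For orientation, a minimal sketch of the Term accessors that every example below exercises; the field name and text ("contents", "lucene") are illustrative assumptions, not values taken from the snippets:

    import org.apache.lucene.index.Term;

    // A Term pairs a field name with a token from that field.
    Term term = new Term("contents", "lucene"); // hypothetical field and text
    String field = term.field();                // "contents"
    String text = term.text();                  // "lucene"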
From source file:RangeFilter.java
License:Apache License
/**
 * Returns a BitSet with true for documents which should be
 * permitted in search results, and false for those that should not.
 */
public BitSet bits(IndexReader reader) throws IOException {
    BitSet bits = new BitSet(reader.maxDoc());
    TermEnum enumerator = (null != lowerTerm
            ? reader.terms(new Term(fieldName, lowerTerm))
            : reader.terms(new Term(fieldName, "")));
    try {
        if (enumerator.term() == null) {
            return bits;
        }
        boolean checkLower = false;
        if (!includeLower) // make adjustments to set to exclusive
            checkLower = true;
        TermDocs termDocs = reader.termDocs();
        try {
            do {
                Term term = enumerator.term();
                if (term != null && term.field().equals(fieldName)) {
                    if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) {
                        checkLower = false;
                        if (upperTerm != null) {
                            int compare = upperTerm.compareTo(term.text());
                            /* if beyond the upper term, or is exclusive and
                             * this is equal to the upper term, break out */
                            if ((compare < 0) || (!includeUpper && compare == 0)) {
                                break;
                            }
                        }
                        /* we have a good term, find the docs */
                        termDocs.seek(enumerator.term());
                        while (termDocs.next()) {
                            bits.set(termDocs.doc());
                        }
                    }
                } else {
                    break;
                }
            } while (enumerator.next());
        } finally {
            termDocs.close();
        }
    } finally {
        enumerator.close();
    }
    return bits;
}
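A hedged usage sketch for the filter above: its constructor is not shown, so the (fieldName, lowerTerm, upperTerm, includeLower, includeUpper) parameter order is an assumption inferred from the fields that bits() reads, and `searcher` stands for an already-open IndexSearcher:

    // Hypothetical: keep only documents whose "date" term falls in
    // [20100101, 20101231], both endpoints included.
    Filter dateFilter = new RangeFilter("date", "20100101", "20101231", true, true);
    TopDocs top = searcher.search(query, dateFilter, 10); // pre-Lucene-4 Filter API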
From source file:CountWords3.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        String usage = "Usage:\tjava CountWords3 indexdir";
        System.out.println(usage);
        System.exit(0);
    }
    String index = args[0];
    String field = "contents";
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)));
    TermEnum te = reader.terms();
    int totalOccs = 0;
    int i = 0;
    boolean notLastTerm = te.next();
    Map<String, Integer> terms = new HashMap<>();
    if (removeStopWords) {
        File stopWordsFile = new File("./src/org/apache/lucene/demo/stopwords_long_EN.txt");
        setStopWords(stopWordsFile);
    }
    while (notLastTerm) {
        Term t = te.term();
        if (t.field().equals(field)) { // ignore if not the desired field
            if (!removeTerm(t.text())) {
                if (!removeStopWords || !stopWords.contains(t.text())) {
                    // Sum this term's frequency across all documents.
                    TermDocs td = reader.termDocs(t);
                    int n = 0;
                    boolean notLastDoc = td.next();
                    while (notLastDoc) {
                        n += td.freq();
                        notLastDoc = td.next();
                    }
                    String text = t.text();
                    if (useStemmer)
                        text = porterStemming(text);
                    Integer value;
                    if (terms.containsKey(text)) {
                        value = n + terms.get(text);
                    } else {
                        value = n;
                    }
                    terms.put(text, value);
                    totalOccs += n;
                    ++i;
                }
            }
        }
        notLastTerm = te.next();
    }
    terms = sortByComparator(terms, false);
    System.out.println("Distinct words: " + terms.size() + "; Word occurrences: " + totalOccs);
}
From source file:br.ufrgs.inf.dsmoura.repository.controller.solr.SolrConversionUtil.java
private static String fromTermsToQueryText(List<Term> terms) {
    if (terms.size() == 0) {
        throw new IllegalArgumentException("Empty terms.");
    }
    String query = "";
    for (Term t : terms) {
        query += t.text();
        float boost = SolrField.getFieldByName(t.field()).getBoost();
        if (boost != SolrFieldBoost.DEFAULT_BOOST) {
            query += "^" + boost;
        }
        query += " OR ";
    }
    return query.substring(0, query.lastIndexOf(" OR "));
}
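To illustrate the output shape, suppose SolrField reports a boost of 2.0 for "name" and the default boost for "description" (both field names and boosts are assumptions here):

    List<Term> terms = Arrays.asList(
            new Term("name", "parser"),
            new Term("description", "parser"));
    // fromTermsToQueryText(terms) would yield: "parser^2.0 OR parser"
    // (the trailing " OR " is trimmed by the final substring call)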
From source file:ca.gnewton.lusql.core.IndexTermFreqCache.java
License:Apache License
public IndexTermFreqCache(final IndexReader newReader, final String newFieldName, int initSize,
        boolean newPreload) throws IOException {
    setPreload(newPreload);
    setReader(newReader);
    setFieldName(newFieldName);
    cache = new HashMap<String, Integer>(initSize);
    if (preload) {
        // Walk every term in the index, caching docFreq for the target field.
        TermEnum te = reader.terms();
        while (te.next()) {
            Term term = te.term();
            if (term.field().equals(fieldName)) {
                cache.put(te.term().text(), Integer.valueOf(reader.docFreq(term)));
            }
        }
    }
}
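The constructor signature suggests usage along these lines; the index path and field name are illustrative, and the final `true` flag triggers the preload loop above:

    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
    // Preload document frequencies for every term of the "contents" field.
    IndexTermFreqCache cache = new IndexTermFreqCache(reader, "contents", 4096, true);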
From source file:ch.ymc.lucehbase.LucandraTermEnum.java
License:Apache License
private void loadTerms(Term skipTo) throws IOException {

    // choose starting term
    String startTerm = indexName + HBaseUtils.delimeter + HBaseUtils.createColumnName(skipTo);

    // this is where we stop: the field name with its last character incremented
    String endTerm = indexName + HBaseUtils.delimeter
            + skipTo.field().substring(0, skipTo.field().length() - 1)
            + new Character((char) (skipTo.field().toCharArray()[skipTo.field().length() - 1] + 1));

    if ((!skipTo.equals(initTerm) || termPosition == 0) && termCache != null) {
        termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey());
    } else {
        termDocFreqBuffer = null;
    }

    if (termDocFreqBuffer != null) {
        termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});
        termPosition = 0;
        logger.debug("Found " + startTerm + " in cache");
        return;
    } else if (chunkCount > 1 && actualInitSize < maxChunkSize) {
        termBuffer = new Term[] {};
        termPosition = 0;
        return; // done!
    }

    chunkCount++;

    // The first time we grab just a few keys
    int count = maxInitSize;

    // otherwise we grab all the rest of the keys
    if (initTerm != null) {
        count = maxChunkSize;
        startTerm = indexName + HBaseUtils.delimeter + HBaseUtils.createColumnName(initTerm);
    }

    long start = System.currentTimeMillis();

    termDocFreqBuffer = new TreeMap<Term, NavigableMap<byte[], byte[]>>();

    // Get all columns
    Scan scan = new Scan(startTerm.getBytes(), endTerm.getBytes());
    scan.addFamily(HBaseUtils.termVecColumnFamily);
    ResultScanner scanner = table.getScanner(scan);
    actualInitSize = 0;
    for (Result result : scanner) {
        ++actualInitSize;
        NavigableMap<byte[], byte[]> columns = result.getFamilyMap(HBaseUtils.termVecColumnFamily);
        byte[] row = result.getRow();
        String rowString = new String(row);

        // term keys look like wikipedia/body/wiki
        String termStr = rowString
                .substring(rowString.indexOf(HBaseUtils.delimeter) + HBaseUtils.delimeter.length());
        Term term;
        try {
            term = HBaseUtils.parseTerm(termStr.getBytes("UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        logger.debug(termStr + " has " + columns.size());

        // check for tombstone keys
        if (columns.size() > 0) {
            termDocFreqBuffer.put(term, columns);
        }
    }

    if (!termDocFreqBuffer.isEmpty()) {
        initTerm = termDocFreqBuffer.lastKey(); // term to start with next time
    }

    logger.debug("Found " + actualInitSize + " keys in range:" + startTerm + " to " + endTerm + " in "
            + (System.currentTimeMillis() - start) + "ms");

    // add a final key (excluded in submap below)
    termDocFreqBuffer.put(finalTerm, null);

    // put in cache
    for (Term termKey : termDocFreqBuffer.keySet()) {
        if (termCache == null) {
            termCache = termDocFreqBuffer;
        } else {
            termCache.putAll(termDocFreqBuffer);
        }
        indexReader.addTermEnumCache(termKey, this);
    }

    // cache the initial term too
    indexReader.addTermEnumCache(skipTo, this);

    termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});
    termPosition = 0;

    long end = System.currentTimeMillis();
    logger.debug("loadTerms: " + startTerm + "(" + termBuffer.length + ") took " + (end - start) + "ms");
}
From source file:com.barchart.feed.ddf.resolver.provider.CodecHelper.java
License:BSD License
/** Convert an instrument into a Lucene document. */
static Document instrumentEncode(final Instrument instrument) {

    final Document doc = new Document();

    {
        final Term term = getKeyTerm(instrument);
        final String name = term.field();
        final String value = term.text();

        /** store; index as a single token (not analyzed) */
        final Field keyField = new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED);

        doc.add(keyField);
    }

    {
        final String name = CodecHelper.FIELD_INST_BODY;
        final String value = fullText(instrument);

        /** index; do not store */
        final Field bodyField = new Field(name, value, Field.Store.NO, Field.Index.ANALYZED);

        doc.add(bodyField);
    }

    // TODO If this ever needs to be used, some hardcoded iteration over the
    // instrument fields will need to be written. Currently this isn't being used.
    // for (final Tag<?> field : CodecHelper.BASE) {
    //
    //     final String name = field.name();
    //     final String value = encode(field, instrument.get(field));
    //
    //     /** store; do not index */
    //     final Field baseField = new Field(name, value, Field.Store.YES,
    //             Field.Index.NO);
    //
    //     doc.add(baseField);
    // }

    return doc;
}
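The body field is analyzed for full-text search while the key field is stored for retrieval, so a search hits on FIELD_INST_BODY and the key is read back from the stored document; a sketch, assuming an open IndexSearcher `searcher` and a caller that knows the key field's name:

    // Hypothetical: hit on the analyzed body, then recover the stored key.
    TopDocs top = searcher.search(
            new TermQuery(new Term(CodecHelper.FIELD_INST_BODY, "corn")), 10);
    for (ScoreDoc sd : top.scoreDocs) {
        Document hit = searcher.doc(sd.doc);
        String key = hit.get(keyFieldName); // keyFieldName is an assumption
    }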
From source file:com.basistech.lucene.tools.LuceneQueryTool.java
License:Apache License
private void runQuery(String queryString, final PrintStream out)
        throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    final IndexSearcher searcher = new IndexSearcher(indexReader);
    docsPrinted = 0;
    Query query;
    if (queryString == null) {
        query = new MatchAllDocsQuery();
    } else {
        if (!queryString.contains(":") && defaultField == null) {
            throw new RuntimeException("query has no ':' and no query-field defined");
        }
        QueryParser queryParser = new QueryParser(defaultField, analyzer);
        queryParser.setLowercaseExpandedTerms(false);
        query = queryParser.parse(queryString).rewrite(indexReader);
        Set<Term> terms = Sets.newHashSet();
        query.createWeight(searcher, false).extractTerms(terms);
        List<String> invalidFieldNames = Lists.newArrayList();
        for (Term term : terms) {
            if (!allFieldNames.contains(term.field())) {
                invalidFieldNames.add(term.field());
            }
        }
        if (!invalidFieldNames.isEmpty()) {
            throw new RuntimeException("Invalid field names: " + invalidFieldNames);
        }
    }
    final Set<String> fieldSet = Sets.newHashSet(fieldNames);
    // use a Collector instead of TopDocs for memory efficiency, especially
    // for the %all query
    class MyCollector extends SimpleCollector {
        private Scorer scorer;
        private long totalHits;
        private int docBase;

        @Override
        protected void doSetNextReader(LeafReaderContext context) throws IOException {
            docBase = context.docBase;
        }

        @Override
        public void collect(int id) throws IOException {
            totalHits++;
            if (docsPrinted >= outputLimit) {
                return;
            }
            id += docBase;
            Document doc = fieldSet.isEmpty() ? searcher.doc(id) : searcher.doc(id, fieldSet);
            boolean passedFilter = regexField == null;
            if (regexField != null) {
                String value = doc.get(regexField);
                if (value != null && regex.matcher(value).matches()) {
                    passedFilter = true;
                }
            }
            if (passedFilter) {
                float score = scorer.score();
                printDocument(doc, id, score, out);
            }
        }

        @Override
        public boolean needsScores() {
            return true;
        }

        @Override
        public void setScorer(Scorer scorer) throws IOException {
            this.scorer = scorer;
        }
    }

    MyCollector collector = new MyCollector();
    searcher.search(query, collector);
    if (showHits) {
        out.println("totalHits: " + collector.totalHits);
        out.println();
    }
}
From source file:com.bizosys.unstructured.IndexSearcher.java
License:Apache License
@Deprecated
public String searchQueryPartsFill(String indexName, String docType, String query, Analyzer analyzer,
        Map<String, String> multiQueryParts) throws Exception {

    System.err.println(
            "\n\n\n************ Stop using this method and instead use the following method. ******************\n"
                    + "public String searchQueryPartsFill( Analyzer analyzer, boolean isAllWords, String multiQuery, Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception\n\n\n");

    String defaultField = "BIZOSYSNONE";
    QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
    Query q = qp.parse(query);

    Set<Term> terms = new HashSet<Term>();
    q.extractTerms(terms);

    int index = 0;
    Map<String, String> termsL = new HashMap<String, String>();

    if (!"*".equals(docType))
        docType = this.sConf.getDocumentTypeCodes().getCode(docType).toString();

    for (Term term : terms) {
        String fieldName = term.field();
        if (defaultField.equals(fieldName))
            fieldName = "*";
        else if ("*".equals(fieldName))
            fieldName = "*";
        else
            fieldName = this.sConf.getFieldTypeCodes().getCode(term.field()).toString();

        String fieldText = term.text();
        String expandedTerm = docType + "|" + fieldName + "|" + Hashing.hash(fieldText) + "|*|*";
        String lhs = indexName + ":" + index;
        multiQueryParts.put(lhs, expandedTerm);

        String fld = term.field();
        if (defaultField.equals(fld))
            termsL.put(fieldText, lhs);
        else
            termsL.put(term.field() + ":" + fieldText, lhs);
        index++;
    }

    // Replace the intermediate terms
    for (String term : termsL.keySet()) {
        String caseQuery = null;
        for (int i = 0; i < 3; i++) {
            switch (i) {
            case 0:
                caseQuery = query;
                break;
            case 1:
                caseQuery = query.toLowerCase();
                break;
            case 2:
                caseQuery = query.toUpperCase();
                break;
            }
            term = term.replace(defaultField + ":", "");
            int caseTermIndex = caseQuery.indexOf(term + " ");
            if (caseTermIndex >= 0) {
                query = query.substring(0, caseTermIndex) + termsL.get(term)
                        + query.substring(caseTermIndex + term.length());
            }
        }
    }

    // Replace the last term
    for (String term : termsL.keySet()) {
        String caseQuery = null;
        for (int j = 0; j < 3; j++) {
            switch (j) {
            case 0:
                caseQuery = query;
                break;
            case 1:
                caseQuery = query.toLowerCase();
                break;
            case 2:
                caseQuery = query.toUpperCase();
                break;
            }
            int caseTermIndex = caseQuery.indexOf(term);
            if (caseTermIndex >= 0) {
                query = query.substring(0, caseTermIndex) + termsL.get(term)
                        + query.substring(caseTermIndex + term.length());
                break;
            }
        }
    }

    return query;
}
From source file:com.bizosys.unstructured.IndexSearcher.java
License:Apache License
public String searchQueryPartsFill(Analyzer analyzer, boolean isAllWords, String multiQuery,
        Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception {

    String defaultField = "BIZOSYSNONE";
    Map<Integer, String> explodedParts = new HashMap<Integer, String>();

    for (String qKey : partsToAnalyze) {
        QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
        Set<Term> terms = new HashSet<Term>();
        Query q = qp.parse(multiQueryParts.get(qKey));
        q.extractTerms(terms);

        int index = 1;
        explodedParts.clear();

        for (Term term : terms) {
            String fieldName = term.field();
            String fieldText = term.text();

            String docType = "*";
            String fieldType = "*";

            int docAndFieldBreakPointIndex = fieldName.indexOf('/');
            if (-1 == docAndFieldBreakPointIndex) {
                docType = fieldName;
            } else {
                docType = fieldName.substring(0, docAndFieldBreakPointIndex);
                fieldType = fieldName.substring(docAndFieldBreakPointIndex + 1);
            }

            if (docType.equals(defaultField))
                docType = "*";
            else if (!("*".equals(docType) || "".equals(docType))) {
                docType = sConf.getDocumentTypeCodes().getCode(docType).toString();
            }

            if (fieldType.equals(defaultField))
                fieldType = "*";
            else if (!("*".equals(fieldType) || "".equals(fieldType))) {
                fieldType = sConf.getFieldTypeCodes().getCode(fieldType).toString();
            }

            String expandedTerm = docType + "|" + fieldType + "|" + Hashing.hash(fieldText) + "|*|*";
            explodedParts.put(index, expandedTerm);
            index++;
        }

        if (explodedParts.size() > 1) {
            multiQueryParts.remove(qKey);
            StringBuilder sb = new StringBuilder();
            boolean isFirst = true;
            for (Integer seq : explodedParts.keySet()) {
                String explodedKey = qKey + seq.toString();
                multiQueryParts.put(explodedKey, explodedParts.get(seq));
                if (isFirst)
                    isFirst = false;
                else {
                    if (isAllWords)
                        sb.append(" AND ");
                    else
                        sb.append(" OR ");
                }
                sb.append(explodedKey);
            }
            multiQuery = multiQuery.replace(qKey, " ( " + sb.toString() + " ) ");
        } else {
            multiQueryParts.put(qKey, explodedParts.get(index - 1));
        }
    }
    return multiQuery;
}
From source file:com.bizosys.unstructured.IndexSearcher.java
License:Apache License
public String searchQueryPartsFillWithMetadata(Analyzer analyzer, boolean isAllWords, String multiQuery,
        Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception {

    String defaultField = "BIZOSYSNONE";
    Map<Integer, String> explodedParts = new HashMap<Integer, String>();

    for (String qKey : partsToAnalyze) {
        QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
        Set<Term> terms = new HashSet<Term>();
        Query q = qp.parse(multiQueryParts.get(qKey));
        q.extractTerms(terms);

        int index = 1;
        explodedParts.clear();

        for (Term term : terms) {
            String fieldName = term.field();
            String searchword = term.text();

            String docType = "*";
            String fieldType = "*";
            String payload = "*";

            int docAndFieldBreakPointIndex = fieldName.indexOf('/');
            if (-1 == docAndFieldBreakPointIndex) {
                docType = fieldName;
            } else {
                docType = fieldName.substring(0, docAndFieldBreakPointIndex);
                fieldType = fieldName.substring(docAndFieldBreakPointIndex + 1);

                int fieldAndPayloadBreakPointIndex = fieldType.indexOf('/');
                if (fieldAndPayloadBreakPointIndex > 0) {
                    // Extract the payload before truncating fieldType; taking
                    // the substring of the already-truncated fieldType would
                    // throw StringIndexOutOfBoundsException.
                    payload = fieldType.substring(fieldAndPayloadBreakPointIndex + 1);
                    fieldType = fieldType.substring(0, fieldAndPayloadBreakPointIndex);
                }
            }

            if (docType.equals(defaultField))
                docType = "*";
            else if (!("*".equals(docType) || "".equals(docType))) {
                docType = sConf.getDocumentTypeCodes().getCode(docType).toString();
            }

            if (fieldType.equals(defaultField))
                fieldType = "*";
            else if (!("*".equals(fieldType) || "".equals(fieldType))) {
                fieldType = sConf.getFieldTypeCodes().getCode(fieldType).toString();
            }

            String expandedTerm = docType + "|" + fieldType + "|" + payload + "|"
                    + Hashing.hash(searchword) + "|*|*";
            explodedParts.put(index, expandedTerm);
            index++;
        }

        if (explodedParts.size() > 1) {
            multiQueryParts.remove(qKey);
            StringBuilder sb = new StringBuilder();
            boolean isFirst = true;
            for (Integer seq : explodedParts.keySet()) {
                String explodedKey = qKey + seq.toString();
                multiQueryParts.put(explodedKey, explodedParts.get(seq));
                if (isFirst)
                    isFirst = false;
                else {
                    if (isAllWords)
                        sb.append(" AND ");
                    else
                        sb.append(" OR ");
                }
                sb.append(explodedKey);
            }
            multiQuery = multiQuery.replace(qKey, " ( " + sb.toString() + " ) ");
        } else {
            multiQueryParts.put(qKey, explodedParts.get(index - 1));
        }
    }
    return multiQuery;
}