List of usage examples for org.apache.lucene.index.Term#text()
public final String text()
From source file:RangeFilter.java
License:Apache License
/** * Returns a BitSet with true for documents which should be * permitted in search results, and false for those that should * not.//ww w .j a va2 s . co m */ public BitSet bits(IndexReader reader) throws IOException { BitSet bits = new BitSet(reader.maxDoc()); TermEnum enumerator = (null != lowerTerm ? reader.terms(new Term(fieldName, lowerTerm)) : reader.terms(new Term(fieldName, ""))); try { if (enumerator.term() == null) { return bits; } boolean checkLower = false; if (!includeLower) // make adjustments to set to exclusive checkLower = true; TermDocs termDocs = reader.termDocs(); try { do { Term term = enumerator.term(); if (term != null && term.field().equals(fieldName)) { if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) { checkLower = false; if (upperTerm != null) { int compare = upperTerm.compareTo(term.text()); /* if beyond the upper term, or is exclusive and * this is equal to the upper term, break out */ if ((compare < 0) || (!includeUpper && compare == 0)) { break; } } /* we have a good term, find the docs */ termDocs.seek(enumerator.term()); while (termDocs.next()) { bits.set(termDocs.doc()); } } } else { break; } } while (enumerator.next()); } finally { termDocs.close(); } } finally { enumerator.close(); } return bits; }
From source file:CountWords3.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length != 1) { String usage = "Usage:\tjava CountWords3 indexdir"; System.out.println(usage); System.exit(0);/*from ww w. ja va 2s . c o m*/ } String index = args[0]; String field = "contents"; String queries = null; IndexReader reader = IndexReader.open(FSDirectory.open(new File(index))); TermEnum te = reader.terms(); int totalOccs = 0; int i = 0; boolean notlastt = te.next(); Map<String, Integer> terms = new HashMap<>(); if (removeStopWords) { File stopWordsFile = new File("./src/org/apache/lucene/demo/stopwords_long_EN.txt"); setStopWords(stopWordsFile); } while (notlastt) { Term t = te.term(); if (t.field().equals(field)) { // ignore if not desired field if (!removeTerm(t.text())) { if (!removeStopWords || !stopWords.contains(t.text())) { TermDocs td = reader.termDocs(t); int n = 0; boolean notlastd = td.next(); while (notlastd) { n += td.freq(); notlastd = td.next(); } String text = t.text(); //String text = removePunctuation(t.text()); if (useStemmer) text = porterStemming(text); Integer value; if (terms.containsKey(text)) { value = n + terms.get(text); } else { value = n; } terms.put(text, value); //System.out.println(t.text() + " " + n); totalOccs += n; ++i; } } } notlastt = te.next(); } terms = sortByComparator(terms, false); int j = 1; /*for (Map.Entry entry : terms.entrySet()) { System.out.print(entry.getKey() + " "); System.out.print(entry.getValue() + " "); System.out.println(j); j += 1; }*/ System.out.println("Distinct words: " + terms.size() + "; Word occurrences: " + totalOccs); }
From source file:au.edu.unimelb.csse.exp.GenerateQueries.java
License:Apache License
private void getAllTerms() throws IOException { final TermEnum terms = reader.getIndexReader().terms(); boolean next = terms.next(); while (next != false) { Term term = terms.term(); if (reader.docFreq(term) > MIN_DOC_FREQ) { String original = term.text().trim(); if (original.equals("S")) { textLabels.add(original); next = terms.next();// w w w . j a v a 2 s. co m continue; } if (original.length() < 2) { next = terms.next(); continue; } if (original.length() == 2) { if (original.toUpperCase().equals(original)) { textLabels.add(original); } else { next = terms.next(); continue; } } int uppercases = 0; for (int i = 0; i < original.length() && uppercases <= original.length() / 2; i++) { if (Character.isUpperCase(original.charAt(i))) { uppercases++; } } if (uppercases > original.length() / 2) { textLabels.add(original); } } next = terms.next(); } }
From source file:au.edu.unimelb.csse.IndexUtils.java
License:Apache License
/**
 * Returns the document frequency of the first enumerated term whose text
 * equals {@code term}.
 *
 * <p>Only the term text is compared; the term's field is not checked, so the
 * first match in enumeration order wins. The term enumeration is closed
 * before returning, including on the early-return path.
 *
 * @param term term text to look for
 * @return {@code reader.docFreq} of the first matching term, or {@code 0}
 *         when no term has that text
 * @throws IOException if reading the index fails
 */
public int findDocFreq(String term) throws IOException {
    TermEnum terms = reader.terms();
    try {
        while (terms.next()) {
            Term t = terms.term();
            if (t.text().equals(term)) {
                return reader.docFreq(t);
            }
        }
        return 0;
    } finally {
        terms.close();
    }
}
From source file:br.ufrgs.inf.dsmoura.repository.controller.solr.SolrConversionUtil.java
/**
 * Joins the texts of the given terms into a single query string separated by
 * {@code " OR "}.
 *
 * <p>A term whose field boost (looked up through
 * {@code SolrField.getFieldByName}) differs from
 * {@code SolrFieldBoost.DEFAULT_BOOST} gets a {@code ^boost} suffix.
 *
 * @param terms terms to join; must not be empty
 * @return the joined query text
 * @throws IllegalArgumentException if {@code terms} is empty
 */
private static String fromTermsToQueryText(List<Term> terms) {
    if (terms.isEmpty()) {
        throw new IllegalArgumentException("Empty terms.");
    }

    StringBuilder query = new StringBuilder();
    String separator = "";
    for (Term t : terms) {
        query.append(separator).append(t.text());

        float boost = SolrField.getFieldByName(t.field()).getBoost();
        if (boost != SolrFieldBoost.DEFAULT_BOOST) {
            query.append('^').append(boost);
        }
        // Separator goes in front of every term but the first, so nothing
        // has to be trimmed from the end afterwards.
        separator = " OR ";
    }
    return query.toString();
}
From source file:com.barchart.feed.ddf.resolver.provider.CodecHelper.java
License:BSD License
/** convert instrument into lucene document */ static Document instrumentEncode(final Instrument instrument) { final Document doc = new Document(); {// ww w .j a v a 2s .co m final Term term = getKeyTerm(instrument); final String name = term.field(); final String value = term.text(); /** store; do not index */ final Field keyField = new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED); doc.add(keyField); } { final String name = CodecHelper.FIELD_INST_BODY; final String value = fullText(instrument); /** index; do not store */ final Field bodyField = new Field(name, value, Field.Store.NO, Field.Index.ANALYZED); doc.add(bodyField); } // TODO If this ever need to be used, then some hardcoding // for iterating over the instrument fields will need to be written. // Currently, this isn't being used. // for (final Tag<?> field : CodecHelper.BASE) { // // final String name = field.name(); // final String value = encode(field, instrument.get(field)); // // /** store; do not index */ // final Field baseField = new Field(name, value, Field.Store.YES, // Field.Index.NO); // // doc.add(baseField); // // } return doc; }
From source file:com.bizosys.hsearch.kv.impl.KVDocIndexer.java
License:Apache License
/**
 * Parses a free-text query and renders it in the pipe-separated form
 * {@code docTypeCode|fieldTypeCode|*|{hash1,hash2,...}|*|*}.
 *
 * <p>The query is parsed with a Lucene 3.6 {@code QueryParser} on default
 * field {@code "K"}; each extracted term's text is hashed with
 * {@code Hashing.hash}. A query that yields no terms produces an empty hash
 * list ({@code {}}) instead of failing.
 *
 * @param analyzer  analyzer handed to the query parser
 * @param docType   document type name, or {@code "*"} for any
 * @param fieldType field type name, or {@code "*"} for any
 * @param query     query text to parse
 * @return the rendered query string
 * @throws IOException            declared by the original signature
 * @throws ParseException         if Lucene cannot parse {@code query}; the
 *                                Lucene exception is attached as the cause
 * @throws InstantiationException declared by the original signature
 */
public String parseQuery(Analyzer analyzer, String docType, String fieldType, String query)
        throws IOException, ParseException, InstantiationException {

    String docTypeCode = "*".equals(docType) ? "*"
            : Integer.valueOf(DocumentTypeCodes.getInstance().getCode(docType)).toString();
    String fldTypeCode = "*".equals(fieldType) ? "*"
            : Integer.valueOf(FieldTypeCodes.getInstance().getCode(fieldType)).toString();

    QueryParser qp = new QueryParser(Version.LUCENE_36, "K", analyzer);
    Query q;
    try {
        q = qp.parse(query);
    } catch (org.apache.lucene.queryParser.ParseException ex) {
        // Translate to the declared exception type but keep the original
        // failure reachable through getCause().
        ParseException translated = new ParseException(ex.getMessage(), 0);
        translated.initCause(ex);
        throw translated;
    }

    Set<Term> terms = new HashSet<Term>();
    q.extractTerms(terms);

    // Built eagerly so that a query without terms renders as "{}" rather
    // than dereferencing a null builder.
    StringBuilder allWords = new StringBuilder("{");
    boolean first = true;
    for (Term term : terms) {
        if (!first) {
            allWords.append(',');
        }
        allWords.append(Hashing.hash(term.text()));
        first = false;
    }
    allWords.append('}');

    StringBuilder queryBuilder = new StringBuilder(1024);
    queryBuilder.append(docTypeCode).append('|');
    queryBuilder.append(fldTypeCode).append('|');
    queryBuilder.append('*').append('|');
    queryBuilder.append(allWords.toString());
    queryBuilder.append("|*|*");
    return queryBuilder.toString();
}
From source file:com.bizosys.unstructured.IndexSearcher.java
License:Apache License
/**
 * Expands every term of {@code query} into an entry of
 * {@code multiQueryParts} and rewrites the query text so that each term is
 * replaced by the key of its entry.
 *
 * <p>A notice asking callers to switch to the
 * {@code (Analyzer, boolean, String, Map, String...)} overload is printed to
 * {@code System.err} on every call.
 *
 * @param indexName       prefix of the generated keys ({@code indexName:0},
 *                        {@code indexName:1}, ...)
 * @param docType         document type name, or {@code "*"}; any other value
 *                        is replaced by its code from {@code sConf}
 * @param query           query text to parse and rewrite
 * @param analyzer        analyzer handed to the query parser
 * @param multiQueryParts receives one {@code key -> docType|field|hash|*|*}
 *                        entry per extracted term
 * @return the query text with term occurrences replaced by their keys
 * @throws Exception if parsing or a code lookup fails
 * @deprecated use the overload named in the printed notice
 */
@Deprecated
public String searchQueryPartsFill(String indexName, String docType, String query, Analyzer analyzer,
        Map<String, String> multiQueryParts) throws Exception {

    System.err.println(
            "\n\n\n************ Stop using this method and instead use the following method. ******************\n"
                    + "public String searchQueryPartsFill( Analyzer analyzer, boolean isAllWords, String multiQuery, Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception\n\n\n");

    // Terms without an explicit field are parsed into this placeholder field.
    String defaultField = "BIZOSYSNONE";
    QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
    Query q = qp.parse(query);
    Set<Term> terms = new HashSet<Term>();
    q.extractTerms(terms);

    int index = 0;
    // Maps the term as it is expected to appear in the query text to its generated key.
    Map<String, String> termsL = new HashMap<String, String>();

    if (!"*".equals(docType))
        docType = this.sConf.getDocumentTypeCodes().getCode(docType).toString();

    for (Term term : terms) {
        // The placeholder field and "*" both become the wildcard; any other
        // field name is replaced by its code.
        String fieldName = term.field();
        if (defaultField.equals(fieldName))
            fieldName = "*";
        else if ("*".equals(fieldName))
            fieldName = "*";
        else
            fieldName = this.sConf.getFieldTypeCodes().getCode(term.field()).toString();

        String fieldText = term.text();
        String expandedTerm = docType + "|" + fieldName + "|" + Hashing.hash(fieldText) + "|*|*";
        String lhs = indexName + ":" + index;
        multiQueryParts.put(lhs, expandedTerm);

        // Remember the textual form to search for: bare text for the
        // placeholder field, "field:text" otherwise.
        String fld = term.field();
        if (defaultField.equals(fld))
            termsL.put(fieldText, lhs);
        else
            termsL.put(term.field() + ":" + fieldText, lhs);
        index++;
    }

    // First pass: replace occurrences that are followed by a space. The
    // query is searched as-is, lower-cased and upper-cased in turn; there is
    // no break, so a replacement can happen in each of the three rounds.
    // NOTE(review): an index found in the case-converted copy is applied to
    // the original query, which presumes case conversion keeps the string
    // length - confirm.
    for (String term : termsL.keySet()) {
        String caseQuery = null;
        for (int i = 0; i < 3; i++) {
            switch (i) {
            case 0:
                caseQuery = query;
                break;
            case 1:
                caseQuery = query.toLowerCase();
                break;
            case 2:
                caseQuery = query.toUpperCase();
                break;
            }
            term = term.replace(defaultField + ":", "");
            int caseTermIndex = caseQuery.indexOf(term + " ");
            if (caseTermIndex >= 0) {
                query = query.substring(0, caseTermIndex) + termsL.get(term)
                        + query.substring(caseTermIndex + term.length());
            }
        }
    }

    // Second pass: replace one remaining occurrence per term (no trailing
    // space required), stopping at the first case variant that matches.
    for (String term : termsL.keySet()) {
        String caseQuery = null;
        for (int j = 0; j < 3; j++) {
            switch (j) {
            case 0:
                caseQuery = query;
                break;
            case 1:
                caseQuery = query.toLowerCase();
                break;
            case 2:
                caseQuery = query.toUpperCase();
                break;
            }
            int caseTermIndex = caseQuery.indexOf(term);
            if (caseTermIndex >= 0) {
                query = query.substring(0, caseTermIndex) + termsL.get(term)
                        + query.substring(caseTermIndex + term.length());
                break;
            }
        }
    }
    return query;
}
From source file:com.bizosys.unstructured.IndexSearcher.java
License:Apache License
public String searchQueryPartsFill(Analyzer analyzer, boolean isAllWords, String multiQuery, Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception { String defaultField = "BIZOSYSNONE"; Map<Integer, String> explodedParts = new HashMap<Integer, String>(); for (String qKey : partsToAnalyze) { QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer); Set<Term> terms = new HashSet<Term>(); Query q = qp.parse(multiQueryParts.get(qKey)); q.extractTerms(terms);//from ww w . java2 s . co m int index = 1; explodedParts.clear(); for (Term term : terms) { String fieldName = term.field(); String fieldText = term.text(); String docType = "*"; String fieldType = "*"; int docAndFieldBreakPointIndex = fieldName.indexOf('/'); if (-1 == docAndFieldBreakPointIndex) { docType = fieldName; } else { docType = fieldName.substring(0, docAndFieldBreakPointIndex); fieldType = fieldName.substring(docAndFieldBreakPointIndex + 1); } if (docType.equals(defaultField)) docType = "*"; else if (!("*".equals(docType) || "".equals(docType))) { docType = sConf.getDocumentTypeCodes().getCode(docType).toString(); } if (fieldType.equals(defaultField)) fieldType = "*"; else if (!("*".equals(fieldType) || "".equals(fieldType))) { fieldType = sConf.getFieldTypeCodes().getCode(fieldType).toString(); } String expandedTerm = docType + "|" + fieldType + "|" + Hashing.hash(fieldText) + "|*|*"; explodedParts.put(index, expandedTerm); index++; } if (explodedParts.size() > 1) { multiQueryParts.remove(qKey); StringBuilder sb = new StringBuilder(); boolean isFirst = true; for (Integer seq : explodedParts.keySet()) { String explodedKey = qKey + seq.toString(); multiQueryParts.put(explodedKey, explodedParts.get(seq)); if (isFirst) isFirst = false; else { if (isAllWords) sb.append(" AND "); else sb.append(" OR "); } sb.append(explodedKey); } multiQuery = multiQuery.replace(qKey, " ( " + sb.toString() + " ) "); } else { multiQueryParts.put(qKey, explodedParts.get(index - 1)); 
} } return multiQuery; }
From source file:com.bizosys.unstructured.IndexSearcher.java
License:Apache License
/**
 * Analyzes the selected parts of a multi-query and replaces each with its
 * expanded {@code docType|fieldType|payload|hash|*|*} form.
 *
 * <p>Each part named in {@code partsToAnalyze} is parsed with a Lucene 3.6
 * {@code QueryParser}. A term's field name is read as
 * {@code docType/fieldType/payload}; missing trailing components default to
 * {@code "*"}. When a part yields more than one term, its entry in
 * {@code multiQueryParts} is replaced by numbered entries ({@code qKey1},
 * {@code qKey2}, ...) and every occurrence of {@code qKey} in
 * {@code multiQuery} becomes a parenthesised AND/OR group of those keys.
 * Otherwise the part's entry is overwritten with the single expanded term.
 *
 * @param analyzer        analyzer handed to the query parser
 * @param isAllWords      {@code true} joins exploded keys with AND, otherwise OR
 * @param multiQuery      query expression referring to parts by key
 * @param multiQueryParts key-to-query map; updated in place
 * @param partsToAnalyze  keys of the parts to analyze
 * @return the rewritten multi-query expression
 * @throws Exception if parsing or a code lookup fails
 */
public String searchQueryPartsFillWithMetadata(Analyzer analyzer, boolean isAllWords, String multiQuery,
        Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception {

    String defaultField = "BIZOSYSNONE";
    Map<Integer, String> explodedParts = new HashMap<Integer, String>();

    for (String qKey : partsToAnalyze) {

        QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
        Set<Term> terms = new HashSet<Term>();
        Query q = qp.parse(multiQueryParts.get(qKey));
        q.extractTerms(terms);

        int index = 1;
        explodedParts.clear();

        for (Term term : terms) {
            String fieldName = term.field();
            String searchword = term.text();

            String docType = "*";
            String fieldType = "*";
            String payload = "*";

            int docAndFieldBreakPointIndex = fieldName.indexOf('/');
            if (-1 == docAndFieldBreakPointIndex) {
                docType = fieldName;
            } else {
                docType = fieldName.substring(0, docAndFieldBreakPointIndex);
                fieldType = fieldName.substring(docAndFieldBreakPointIndex + 1);

                int fieldAndPayloadBreakPointIndex = fieldType.indexOf('/');
                if (fieldAndPayloadBreakPointIndex > 0) {
                    // Slice the payload BEFORE truncating fieldType. Doing it
                    // the other way round indexes past the end of the
                    // shortened string and throws
                    // StringIndexOutOfBoundsException.
                    payload = fieldType.substring(fieldAndPayloadBreakPointIndex + 1);
                    fieldType = fieldType.substring(0, fieldAndPayloadBreakPointIndex);
                }
            }

            // The placeholder field means "any"; "*" and "" pass through
            // untouched; everything else is translated to its code.
            if (docType.equals(defaultField)) {
                docType = "*";
            } else if (!("*".equals(docType) || "".equals(docType))) {
                docType = sConf.getDocumentTypeCodes().getCode(docType).toString();
            }

            if (fieldType.equals(defaultField)) {
                fieldType = "*";
            } else if (!("*".equals(fieldType) || "".equals(fieldType))) {
                fieldType = sConf.getFieldTypeCodes().getCode(fieldType).toString();
            }

            String expandedTerm = docType + "|" + fieldType + "|" + payload + "|" + Hashing.hash(searchword)
                    + "|*|*";
            explodedParts.put(index, expandedTerm);
            index++;
        }

        if (explodedParts.size() > 1) {
            multiQueryParts.remove(qKey);

            StringBuilder sb = new StringBuilder();
            boolean isFirst = true;
            for (Integer seq : explodedParts.keySet()) {
                String explodedKey = qKey + seq.toString();
                multiQueryParts.put(explodedKey, explodedParts.get(seq));
                if (isFirst) {
                    isFirst = false;
                } else {
                    sb.append(isAllWords ? " AND " : " OR ");
                }
                sb.append(explodedKey);
            }
            multiQuery = multiQuery.replace(qKey, " ( " + sb.toString() + " ) ");
        } else {
            multiQueryParts.put(qKey, explodedParts.get(index - 1));
        }
    }
    return multiQuery;
}