List of usage examples for org.apache.lucene.analysis.standard StandardTokenizer close
@Override public void close() throws IOException
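StandardTokenizer inherits close() from Tokenizer, which implements Closeable: closing the stream releases the underlying Reader and its buffers. The usual consumer lifecycle is reset(), incrementToken() in a loop, end(), then close(). Below is a minimal standalone sketch of that lifecycle; it mirrors the Lucene 4.x constructor used in the first example on this page and is an illustration, not code from any of the source files listed.

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class CloseExample {
        public static List<String> tokens(String text) throws IOException {
            List<String> out = new ArrayList<String>();
            // Tokenizer implements Closeable, so try-with-resources guarantees close()
            // runs even if incrementToken() throws.
            try (StandardTokenizer tok = new StandardTokenizer(Version.LUCENE_41, new StringReader(text))) {
                CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
                tok.reset();                     // must be called before incrementToken()
                while (tok.incrementToken()) {
                    out.add(term.toString());
                }
                tok.end();                       // records end-of-stream state
            }                                    // close() releases the underlying Reader
            return out;
        }
    }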
From source file:com.github.jiloc.USTweetsAnalyzer.Analyzer_Index.java
/**
 * Takes an input string and tokenizes it into an ArrayList of strings (tokens), which is returned.
 * @param text - the string to be split
 * @return an ArrayList of strings
 * @throws IOException
 */
public ArrayList<String> tokenizeText(String text) throws IOException {
    StringReader reader = new StringReader(text);
    StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_41, reader);
    CharTermAttribute charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class);
    tokenizer.reset();
    ArrayList<String> tokens = new ArrayList<String>();
    while (tokenizer.incrementToken()) {
        tokens.add(charTermAttrib.toString());
    }
    tokenizer.end();
    tokenizer.close();
    // System.out.println("tokenizetext: " + tokens.toString());
    return tokens;
}
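If this example is ported to a newer Lucene release (roughly 5.x and later; this is an assumption, the examples on this page target 3.x/4.x), the Version/Reader constructor no longer exists: the tokenizer is created with a no-argument constructor, the input is attached with setReader(), and try-with-resources can take care of close(). A hedged sketch of the equivalent method:

    // Sketch for newer Lucene releases (5.x+): no-arg constructor, input supplied
    // via setReader(), close() handled by try-with-resources.
    public ArrayList<String> tokenizeText(String text) throws IOException {
        ArrayList<String> tokens = new ArrayList<>();
        try (StandardTokenizer tokenizer = new StandardTokenizer()) {
            tokenizer.setReader(new StringReader(text));
            CharTermAttribute charTermAttrib = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                tokens.add(charTermAttrib.toString());
            }
            tokenizer.end();
        }
        return tokens;
    }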
From source file:com.ikanow.infinit.e.api.knowledge.SearchHandler.java
License:Open Source License
public ResponsePojo getSuggestions(String userIdStr, String term, String communityIdStrList, boolean bIncludeGeo,
        boolean bIncludeLinkdata, boolean bWantNoAlias) {
    long nSysTime = System.currentTimeMillis();

    ResponsePojo rp = new ResponsePojo();

    ElasticSearchManager gazIndex = ElasticSearchManager.getIndex(entityIndex_);

    // Need to do a quick decomposition of the term to fit in with analyzed strings
    String escapedterm = null;
    StandardTokenizer st = new StandardTokenizer(Version.LUCENE_30,
            new StringReader(ContentUtils.stripDiacritics(term)));
    CharTermAttribute termAtt = st.addAttribute(CharTermAttribute.class);
    StringBuffer sb = new StringBuffer();
    try {
        try {
            st.reset();
            while (st.incrementToken()) {
                if (sb.length() > 0) {
                    sb.append(" +");
                } else {
                    sb.append('+');
                }
                sb.append(luceneEncodeTerm(termAtt.toString()));
            }
        } finally {
            st.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    if (!term.endsWith(" ") || (0 == sb.length())) { // Could be in the middle of typing, stick a * on the end
        sb.append('*');
    } //TESTED
    escapedterm = sb.toString();

    // Create the search query
    SearchRequestBuilder searchOptions = gazIndex.getSearchOptions();
    BaseQueryBuilder queryObj1 = QueryBuilders.queryString(escapedterm)
            .defaultField(EntityFeaturePojoIndexMap.Mapping.RootObject.RootProperties.alias_pri_);

    String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);
    BaseQueryBuilder queryObj2 = QueryBuilders.boolQuery()
            .should(QueryBuilders.termsQuery(EntityFeaturePojo.communityId_, communityIdStrs));

    BaseQueryBuilder queryObj = QueryBuilders.boolQuery().must(queryObj1).must(queryObj2);

    searchOptions.addSort(EntityFeaturePojo.doccount_, SortOrder.DESC);
    searchOptions.addFields(EntityFeaturePojo.disambiguated_name_, EntityFeaturePojo.doccount_,
            EntityFeaturePojo.type_, EntityFeaturePojo.dimension_);
    if (bIncludeGeo) {
        searchOptions.addFields(EntityFeaturePojo.geotag_);
        searchOptions.addFields(EntityFeaturePojo.ontology_type_);
    }
    if (bIncludeLinkdata) {
        searchOptions.addFields(EntityFeaturePojo.linkdata_);
    }

    // Initial alias handling:
    AliasLookupTable aliasTable = null;
    HashMap<String, SearchSuggestPojo> aliasResults = null;
    if (!bWantNoAlias) {
        AliasManager aliasManager = AliasManager.getAliasManager();
        if (null != aliasManager) {
            aliasTable = aliasManager.getAliasLookupTable(communityIdStrList, communityIdStrs, null, userIdStr);
        }
    } //TESTED

    // Also create an internal Lucene index for aliases, in case any of them do not have actual entities representing them
    List<EntityFeaturePojo> extraEntries = null;
    if (null != aliasTable) {
        extraEntries = checkAliasMasters(aliasTable, escapedterm);
    }
    // (end initial alias handling)

    int nDesiredSize = 20;
    if (null == aliasTable) {
        searchOptions.setSize(nDesiredSize); // will forward all 20
    } else {
        searchOptions.addFields(EntityFeaturePojo.index_);
        searchOptions.setSize(3 * nDesiredSize); // will forward top 20 after de-aliasing
        aliasResults = new HashMap<String, SearchSuggestPojo>();
        // (We use this to ensure we only include each entity once after aliasing)
    } //TESTED

    // Perform the search
    SearchResponse rsp = gazIndex.doQuery(queryObj, searchOptions);

    // Format the return values
    SearchHit[] docs = rsp.getHits().getHits();

    DimensionListPojo dimlist = new DimensionListPojo();
    int nDocsAdded = 0;

    if (null != extraEntries) {
        // Put the alias masters at the top:
        //DEBUG
        //System.out.println(Arrays.toString(extraEntries.toArray()));
        for (EntityFeaturePojo alias : extraEntries) {
            SearchSuggestPojo sp = new SearchSuggestPojo();
            if (null != alias.getDimension()) {
                sp.setDimension(alias.getDimension().toString());
            } else {
                sp.setDimension("What");
            }
            sp.setValue(alias.getDisambiguatedName());
            sp.setType(alias.getType());
            if (bIncludeGeo) {
                sp.setGeotag(alias.getGeotag());
            }
            sp.setOntology_type(alias.getOntology_type());
            dimlist.addSearchSuggestPojo(sp);
        }
    } //TESTED (inc geo)

    if (null != docs) {
        for (SearchHit hit : docs) {
            SearchHitField shf = hit.field(EntityFeaturePojo.disambiguated_name_);
            if (null == shf) { // robustness check, sometimes if the harvester goes wrong this field might be missing
                continue;
            }
            String disname = (String) shf.value();
            String type = (String) hit.field(EntityFeaturePojo.type_).value();
            String dimension = (String) hit.field(EntityFeaturePojo.dimension_).value();
            SearchSuggestPojo sp = new SearchSuggestPojo();

            sp.setValue(disname);
            sp.setDimension(dimension);
            sp.setType(type);

            if (bIncludeGeo) {
                SearchHitField loc = hit.field(EntityFeaturePojo.geotag_);
                if (loc != null)
                    sp.setLocFromES((String) loc.value());
                SearchHitField ont = hit.field(EntityFeaturePojo.ontology_type_);
                if (ont != null)
                    sp.setOntology_type((String) ont.value());
            }
            if (bIncludeLinkdata) {
                SearchHitField linkdata = hit.field(EntityFeaturePojo.linkdata_);
                if (linkdata != null)
                    sp.setLinkdata(linkdata.values());
            }

            // More alias handling
            String index = null;
            if (null != aliasTable) {
                index = (String) hit.field(EntityFeaturePojo.index_).value();
                EntityFeaturePojo alias = aliasTable.getAliasMaster(index);
                if (null != alias) { // Found!
                    if (alias.getIndex().equalsIgnoreCase("discard")) { // Discard this entity
                        continue;
                    } else if ((null != alias.getDisambiguatedName()) && (null != alias.getType())) {
                        // (these need to be present)
                        //DEBUG (perf critical)
                        //logger.debug("Alias! Replace " + index + " with " + alias.getIndex());
                        index = alias.getIndex();
                        disname = alias.getDisambiguatedName();
                        type = alias.getType();
                        if (null != alias.getDimension()) {
                            dimension = alias.getDimension().toString();
                        } else { // Guess from type
                            dimension = DimensionUtility.getDimensionByType(type).toString();
                        }
                        // Reset values:
                        sp.setValue(disname);
                        sp.setDimension(dimension);
                        sp.setType(type);
                    }
                }
                SearchSuggestPojo existing = aliasResults.get(index);
                if (null != existing) {
                    //DEBUG (perf critical)
                    //logger.debug("Alias! Remove duplicate " + index);
                    if ((null == existing.getGeotag()) && (null != sp.getGeotag())) {
                        // (if they're both set then sigh just ignore on a first-come-first-served basis)
                        existing.setGeotag(sp.getGeotag());
                        existing.setOntology_type(sp.getOntology_type());
                    } //TESTED
                    if (null != sp.getLinkdata()) { // (here we can just combine the linkdata)
                        if (null == existing.getLinkdata()) {
                            existing.setLinkdata(sp.getLinkdata());
                        } else {
                            existing.getLinkdata().addAll(sp.getLinkdata());
                        }
                    } //TESTED
                    continue; // (ie don't add this guy)
                } else { // add it
                    aliasResults.put(index, sp);
                }
            } //TESTED
            // (end more alias handling)

            dimlist.addSearchSuggestPojo(sp);
            // (only adds unique entries, ie handles multiple communities "ok" (only ok
            // because it doesn't sum the doccounts across multiple communities, you'd probably
            // want to use facets for that, but it doesn't seem worth it, especially since we're
            // pretty short on field cache space)
            if (++nDocsAdded >= nDesiredSize) { // (can happen in the de-aliasing case)
                break;
            } //TESTED
        }
    }
    rp.setData(dimlist);
    rp.setResponse(new ResponseObject("Suggestions", true, term));

    if (nSysTime > (lastSuggestLog + 5000)) {
        lastSuggestLog = nSysTime;
        logMsg.setLength(0);
        logMsg.append("knowledge/searchSuggest query=").append(escapedterm);
        logMsg.append(" groups=").append(communityIdStrList);
        logMsg.append(" found=").append(docs.length);
        logMsg.append(" time=").append(System.currentTimeMillis() - nSysTime).append(" ms");
        logger.info(logMsg.toString());
    }
    return rp;
}
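A concrete trace of the escaping loop above (the input value is purely illustrative, and it assumes luceneEncodeTerm leaves plain alphanumeric tokens unchanged):

    // e.g. term = "barack oba"            (user still typing, no trailing space)
    // tokens emitted by StandardTokenizer: "barack", "oba"
    // sb after the loop:                   "+barack +oba"
    // trailing '*' appended:               "+barack +oba*"
    // i.e. every completed token becomes a required term and the partially
    // typed last token becomes a prefix query.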
From source file:com.ikanow.infinit.e.api.knowledge.SearchHandler.java
License:Open Source License
public ResponsePojo getAssociationSuggestions(String userIdStr, String ent1, String verb, String ent2, String field,
        String communityIdStrList, boolean bWantNoAlias) {
    ResponsePojo rp = new ResponsePojo();
    try {
        // Community ids, needed in a couple of places
        String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);

        // Initial alias handling:
        AliasLookupTable aliasTable = null;
        if (!bWantNoAlias) {
            AliasManager aliasManager = AliasManager.getAliasManager();
            if (null != aliasManager) {
                aliasTable = aliasManager.getAliasLookupTable(communityIdStrList, communityIdStrs, null, userIdStr);
            }
        } //TESTED

        ElasticSearchManager esm = ElasticSearchManager.getIndex(assocIndex_);
        SearchRequestBuilder searchOptions = esm.getSearchOptions();
        BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
        boolean bExtraQueryTerms = false;
        String term = "";
        if (!ent1.equals("null")) {
            if (field.equals(AssociationFeaturePojo.entity1_))
                term = ent1;
            else {
                bExtraQueryTerms = true;
                EntityFeaturePojo alias = null;
                if (null != aliasTable) {
                    alias = aliasTable.getAliasMaster(ent1);
                }
                if (null != alias) { // Found!
                    boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.entity1_index_,
                            alias.getAlias().toArray()));
                } else {
                    boolQuery.must(QueryBuilders.termQuery(AssociationFeaturePojo.entity1_index_, ent1));
                } //TESTED
            }
        }
        if (!verb.equals("null")) {
            if (field.equals(AssociationFeaturePojo.verb_))
                term = verb;
            else {
                bExtraQueryTerms = true;
                boolQuery.must(QueryBuilders
                        .queryString(new StringBuffer("+").append(verb.replaceAll("\\s+", " +")).toString())
                        .defaultField(AssociationFeaturePojo.verb_));
            }
        }
        if (!ent2.equals("null")) {
            if (field.equals(AssociationFeaturePojo.entity2_))
                term = ent2;
            else {
                bExtraQueryTerms = true;
                EntityFeaturePojo alias = null;
                if (null != aliasTable) {
                    alias = aliasTable.getAliasMaster(ent2);
                }
                if (null != alias) { // Found!
                    boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.entity2_index_,
                            alias.getAlias().toArray()));
                } else {
                    boolQuery.must(QueryBuilders.termQuery(AssociationFeaturePojo.entity2_index_, ent2));
                }
            } //TESTED (cut and paste from entity1)
        }

        String escapedterm = null;
        StandardTokenizer st = new StandardTokenizer(Version.LUCENE_30,
                new StringReader(ContentUtils.stripDiacritics(term)));
        CharTermAttribute termAtt = st.addAttribute(CharTermAttribute.class);
        StringBuffer sb = new StringBuffer();
        try {
            try {
                st.reset();
                while (st.incrementToken()) {
                    if (sb.length() > 0) {
                        sb.append(" +");
                    } else {
                        sb.append('+');
                    }
                    sb.append(luceneEncodeTerm(termAtt.toString()));
                }
            } finally {
                st.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        if (!term.endsWith(" ") || (0 == sb.length())) { // Could be in the middle of typing, stick a * on the end
            sb.append('*');
        } //TESTED
        escapedterm = sb.toString();

        // Also create an internal Lucene index for aliases, in case any of them do not have actual entities representing them
        List<EntityFeaturePojo> extraEntries = null;
        BoolQueryBuilder extraQueryTerms = null;
        if (field.startsWith("entity")) {
            String indexField = field.startsWith("entity1") ? "entity1_index" : "entity2_index";
            if (null != aliasTable) {
                extraEntries = checkAliasMasters(aliasTable, escapedterm);
            }
            if (null != extraEntries) {
                extraQueryTerms = QueryBuilders.boolQuery();
                int nExtraTerms = 0;
                Iterator<EntityFeaturePojo> aliasIt = extraEntries.iterator();
                while (aliasIt.hasNext()) {
                    EntityFeaturePojo alias = aliasIt.next();
                    nExtraTerms += alias.getAlias().size();

                    if (!bExtraQueryTerms && (nExtraTerms > 20)) { // If not filtering on event type we'll be more aggressive
                        break;
                    } //TESTED
                    if (bExtraQueryTerms && (nExtraTerms > 60)) { // If the number of terms gets too large bail anyway
                        break;
                    } //TESTED

                    extraQueryTerms.should(QueryBuilders.termsQuery(indexField, alias.getAlias().toArray()));
                    aliasIt.remove();
                } //end loop over entities
            } //if found new aliases
        } //(if this is an entity lookup) TESTED - including breaking out because of # of terms
        // (end initial alias handling)

        if (null == extraQueryTerms) {
            boolQuery.must(QueryBuilders.queryString(escapedterm).defaultField(field));
        } else { //(in this case combine the escaped term with the aliases)
            extraQueryTerms.should(QueryBuilders.queryString(escapedterm).defaultField(field));
            boolQuery.must(extraQueryTerms);
        } //TESTED
        boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.communityId_, communityIdStrs));

        searchOptions.addSort(AssociationFeaturePojo.doccount_, SortOrder.DESC);

        // Work out which fields to return:
        //TODO (INF-1234) need to work out what to do with quotations and similar here (ie entityX without entityX_index)
        String returnfield;
        boolean bReturningEntities = true;
        if (field.equals(AssociationFeaturePojo.entity1_)) {
            returnfield = AssociationFeaturePojo.entity1_index_;
            searchOptions.addFields(AssociationFeaturePojo.entity1_index_, AssociationFeaturePojo.doccount_);
        } else if (field.equals(AssociationFeaturePojo.entity2_)) {
            returnfield = AssociationFeaturePojo.entity2_index_;
            searchOptions.addFields(AssociationFeaturePojo.entity2_index_, AssociationFeaturePojo.doccount_);
        } else {
            bReturningEntities = false;
            returnfield = AssociationFeaturePojo.verb_;
            searchOptions.addFields(AssociationFeaturePojo.verb_, AssociationFeaturePojo.verb_category_,
                    AssociationFeaturePojo.doccount_);
        }

        int nNumSuggestionsToReturn = 20;
        if (bReturningEntities && (null != aliasTable)) {
            searchOptions.setSize(3 * nNumSuggestionsToReturn); // we're going to remove some duplicates so get more than we need
        } else { // normal case
            searchOptions.setSize(nNumSuggestionsToReturn);
        }

        SearchResponse rsp = esm.doQuery(boolQuery, searchOptions);
        SearchHit[] docs = rsp.getHits().getHits();

        // Currently this code takes the results and puts them into a set so there are no duplicates.
        // Duplicates occur, for example, when you search for "obama": you get obama/quotation/quote1
        // and obama/travel/spain. May want to work this differently, or at least sum up frequency.
        Set<String> suggestions = new HashSet<String>();
        for (SearchHit hit : docs) {
            SearchHitField retField = hit.field(returnfield); // (this can be null in theory/by mistake)
            if (null != retField) {
                String suggestion = (String) retField.value();
                if (bReturningEntities && (null != aliasTable)) {
                    // More alias handling
                    EntityFeaturePojo alias = aliasTable.getAliasMaster(suggestion);
                    if (null != alias) { // Found!
                        if (alias.getIndex().equalsIgnoreCase("discard")) { // Discard this entity
                            continue;
                        } else { // (these need to be present)
                            suggestion = alias.getIndex();
                        }
                    } //TESTED
                } else { // (old code, still valid for verbs or no aliases)
                    if (returnfield.equals(AssociationFeaturePojo.verb_)
                            && hit.field(AssociationFeaturePojo.verb_category_) != null)
                    // for some reason verb_cat can be null!?!?! i think this is broken (ent1 facebook inc/company verb *)
                    {
                        String verbcat = (String) hit.field(AssociationFeaturePojo.verb_category_).value();
                        suggestion += " (" + verbcat + ")";
                        suggestions.add(verbcat);
                    }
                }
                suggestions.add(suggestion);

                if (suggestions.size() >= nNumSuggestionsToReturn) {
                    break;
                }
            } // (end return string valid)
        } //end loop over suggestions

        // Add any aliases that I couldn't explicitly convert to query terms
        if ((null != extraEntries) && (suggestions.size() < nNumSuggestionsToReturn)) {
            for (EntityFeaturePojo alias : extraEntries) {
                suggestions.add(alias.getIndex());
                if (suggestions.size() >= nNumSuggestionsToReturn) {
                    break;
                }
            }
        } //(end add any remaining entries) //TESTED

        String[] suggestionArray = new String[suggestions.size()];
        rp.setData(Arrays.asList(suggestions.toArray(suggestionArray)), (BasePojoApiMap<String>) null);

        String searchTerm = "";
        if (field.equals(AssociationFeaturePojo.entity1_))
            searchTerm = ent1;
        else if (field.equals(AssociationFeaturePojo.verb_))
            searchTerm = verb;
        else
            searchTerm = ent2;

        rp.setResponse(new ResponseObject("Association Suggestions", true, searchTerm));
    } catch (Exception ex) {
        ex.printStackTrace();
        rp.setResponse(new ResponseObject("Association Suggestions", false,
                "Response returned unsuccessfully: " + ex.getMessage()));
    }
    return rp;
}
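Both SearchHandler methods above repeat the same tokenize, escape, and close loop. A possible consolidation is sketched below; the helper name is hypothetical, while luceneEncodeTerm, ContentUtils.stripDiacritics, and the Lucene 3.0 constructor are taken from the examples. It keeps the StandardTokenizer.close() guarantee in a single finally block.

    // Hypothetical helper consolidating the duplicated escaping loop from
    // getSuggestions and getAssociationSuggestions. close() stays in a finally
    // block so the tokenizer (and its Reader) is released even if
    // incrementToken() throws.
    private String escapeTermForLucene(String term) {
        StringBuffer sb = new StringBuffer();
        StandardTokenizer st = new StandardTokenizer(Version.LUCENE_30,
                new StringReader(ContentUtils.stripDiacritics(term)));
        CharTermAttribute termAtt = st.addAttribute(CharTermAttribute.class);
        try {
            try {
                st.reset();
                while (st.incrementToken()) {
                    sb.append(sb.length() > 0 ? " +" : "+");
                    sb.append(luceneEncodeTerm(termAtt.toString()));
                }
            } finally {
                st.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (!term.endsWith(" ") || (0 == sb.length())) {
            sb.append('*'); // caller may still be typing, so add a trailing wildcard
        }
        return sb.toString();
    }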