List of usage examples for org.apache.lucene.document Document getFields
public IndexableField[] getFields(String name)
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/**
 * Performs analysis of the seed (unseen) document, extracting per-field token
 * arrays and any boost declared with the "field^boost" syntax in
 * {@code textFieldNames}. This runs only once per seed document.
 *
 * @param inputDocument         the seed unseen document
 * @param fieldName2tokensArray receives, per field name, the list of token arrays for all its values
 * @param fieldName2boost       receives the boost associated with each field
 * @throws IOException If there is a low-level I/O error
 */
private void analyzeSeedDocument(Document inputDocument, Map<String, List<String[]>> fieldName2tokensArray,
        Map<String, Float> fieldName2boost) throws IOException {
    for (int idx = 0; idx < textFieldNames.length; idx++) {
        String rawName = textFieldNames[idx];
        float fieldBoost = 1;
        // "name^boost" syntax: split off the boost factor when present
        if (rawName.contains("^")) {
            String[] parts = rawName.split("\\^");
            rawName = parts[0];
            fieldBoost = Float.parseFloat(parts[1]);
        }
        List<String[]> valuesTokens = new LinkedList<>();
        for (IndexableField value : inputDocument.getFields(rawName)) {
            TokenStream stream = value.tokenStream(field2analyzer.get(rawName), null);
            valuesTokens.add(getTokenArray(stream));
        }
        fieldName2tokensArray.put(rawName, valuesTokens);
        fieldName2boost.put(rawName, fieldBoost);
        // store back the boost-stripped name so later passes do not re-parse it
        textFieldNames[idx] = rawName;
    }
}
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms *///from ww w. j a v a 2 s. co m private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Int> termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Terms vector = ir.getTermVector(docNum, fieldName); // field does not store term vector info if (vector == null) { Document d = ir.document(docNum); IndexableField[] fields = d.getFields(fieldName); for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName); } } } else { addTermFrequencies(termFreqMap, vector); } } return createQueue(termFreqMap); }
From source file:axiom.objectmodel.dom.LuceneManager.java
License:Open Source License
public static HashMap luceneDocumentToMap(Document doc) { HashMap map = new HashMap(); Enumeration e = doc.fields(); while (e.hasMoreElements()) { Field f = (Field) e.nextElement(); String fieldname = f.name(); if (LuceneManager.isSearchOnlyField(fieldname)) { map.put(fieldname, undoFields(doc.getFields(fieldname))); } else {//from w ww. java 2 s .c om map.put(fieldname, f.stringValue()); } } return map; }
From source file:axiom.objectmodel.dom.LuceneManager.java
License:Open Source License
public Key[] getTargetNodeIds(final String id, final int mode, ArrayList protos, BooleanQuery append, Sort sort) throws Exception { IndexSearcher searcher = null;// ww w . ja va 2 s.co m Document doc = null; BooleanQuery query = null; try { searcher = this.getIndexSearcher(); String idvalue = id; Query id_query = new TermQuery(new Term(ID, idvalue)); for (int i = mode; i >= LIVE_MODE; i--) { if (i != LIVE_MODE && !isDraftNode(id, mode)) { continue; } query = new BooleanQuery(); query.add(id_query, BooleanClause.Occur.MUST); query.add(new TermQuery(new Term(LAYER_OF_SAVE, i + "")), BooleanClause.Occur.MUST); Hits hits = searcher.search(query); /*if (app.debug()) app.logEvent("LuceneManager.getTargetNodeIds(): id=" + id + ",layer=" + mode + " executed query [" + query + "] which resulted in " + hits.length() + " hits");*/ if (hits.length() > 0) { doc = hits.doc(0); break; } } } catch (Exception ex) { app.logError( ErrorReporter.errorMsg(this.getClass(), "getSourceReferences") + "Could not retrieve document " + id + " from Lucene index with query = " + (query != null ? 
query : "null"), ex); throw ex; } finally { this.releaseIndexSearcher(searcher); } if (doc == null) { return new Key[0]; } Field[] fields = doc.getFields(REF_LIST_FIELD); int len; if ((fields == null) || ((len = fields.length) == 0)) { return new Key[0]; } ArrayList<Key> keys = new ArrayList<Key>(); doc = null; for (int i = 0; i < len; i++) { doc = null; query = new BooleanQuery(); String refid = getIdFromRefListField(fields[i]); query.add(new TermQuery(new Term(ID, refid)), BooleanClause.Occur.MUST); BooleanQuery proto_query = null; final int sizeOfProtos; if ((sizeOfProtos = protos.size()) > 0) { proto_query = new BooleanQuery(); for (int j = 0; j < sizeOfProtos; j++) { proto_query.add(new TermQuery(new Term(PROTOTYPE, (String) protos.get(j))), BooleanClause.Occur.SHOULD); } query.add(proto_query, BooleanClause.Occur.MUST); } if (append != null && append.getClauses().length > 0) { query.add(append, BooleanClause.Occur.MUST); } for (int j = mode; j >= LIVE_MODE; j--) { if (j != LIVE_MODE && !isDraftNode(refid, mode)) { continue; } query.add(new TermQuery(new Term(LAYER_OF_SAVE, j + "")), BooleanClause.Occur.MUST); Hits hits = searcher.search(query); /*if (app.debug()) app.logEvent("LuceneManager.getTargetNodeIds() [for retrieving target " + "keys]: id=" + id + ",layer=" + mode + " executed query [" + query + "] which resulted in " + hits.length() + " hits");*/ if (hits.length() > 0) { doc = hits.doc(0); break; } } if (doc != null) { keys.add(new DbKey(this.app.getDbMapping(doc.get(PROTOTYPE)), doc.get(ID), mode)); } } Key[] key_arr = new Key[keys.size()]; return keys.toArray(key_arr); }
From source file:axiom.objectmodel.dom.LuceneManager.java
License:Open Source License
/**
 * Returns true if the given id appears as the target of any reference stored
 * in the document's REF_LIST_FIELD values (the first NULL_DELIM-separated token).
 */
public static boolean isIdInDocumentRefs(Document doc, String id) {
    Field[] refFields = doc.getFields(REF_LIST_FIELD);
    if (refFields == null) {
        return false;
    }
    for (Field refField : refFields) {
        String target = refField.stringValue().split(NULL_DELIM)[0];
        if (id.equals(target)) {
            return true;
        }
    }
    return false;
}
From source file:axiom.scripting.rhino.LuceneQueryDispatcher.java
License:Open Source License
/**
 * Converts Lucene hits into Reference script objects: for every hit, each
 * stored REF_LIST_FIELD value whose target id is in {@code targets} becomes a
 * "Reference" host object added to {@code results}. No-op when the scripting
 * global is unavailable.
 */
private void luceneResultsToReferences(final Hits hits, final ArrayList results, final HashSet targets,
        final int mode) throws Exception {
    final int hitCount = hits.length();
    final String idField = LuceneManager.ID;
    final String refField = LuceneManager.REF_LIST_FIELD;
    final String delim = LuceneManager.NULL_DELIM;
    final Context cx = Context.getCurrentContext();
    final GlobalObject global = this.core != null ? this.core.global : null;
    if (global == null) {
        return;
    }
    for (int i = 0; i < hitCount; i++) {
        Document d = hits.doc(i);
        Field f = d.getField(idField);
        if (f == null) {
            continue;
        }
        final String sourceId = f.stringValue();
        Field[] refFields = d.getFields(refField);
        if (refFields == null) {
            continue;
        }
        for (Field rf : refFields) {
            // value layout: targetId <delim> sourceProperty [<delim> sourceIndex [<delim> sourceXPath]]
            String[] parts = rf.stringValue().split(delim);
            if (!targets.contains(parts[0])) {
                continue;
            }
            final Object[] args = { new DbKey(null, parts[0], mode) };
            Reference relobj = (Reference) cx.newObject(global, "Reference", args);
            relobj.setSourceKey(new DbKey(null, sourceId, mode));
            relobj.setSourceProperty(parts[1]);
            if (parts.length > 2) {
                relobj.setSourceIndex(Integer.parseInt(parts[2]));
                if (parts.length > 3) {
                    relobj.setSourceXPath(parts[3]);
                }
            }
            results.add(relobj);
        }
    }
}
From source file:com.b2international.index.lucene.IndexFieldBase.java
License:Apache License
/**
 * Returns every indexable field of the given document whose name matches this
 * index field's configured name.
 */
public final IndexableField[] getFields(Document doc) {
    final String name = fieldName();
    return doc.getFields(name);
}
From source file:com.core.nlp.query.MoreLikeThis.java
License:Apache License
/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms *//*from w w w . java 2s .c o m*/ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Int> termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; if (vectors != null) { vector = vectors.terms(fieldName); } else { vector = null; } // field does not store term vector info if (vector == null) { Document d = ir.document(docNum); IndexableField[] fields = d.getFields(fieldName); for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName); } } } else { addTermFrequencies(termFreqMap, vector); } } return createQueue(termFreqMap); }
From source file:com.esri.gpt.catalog.lucene.LuceneIndexAdapter.java
License:Apache License
/** * Queries the ACL values indexed for a document. * @param uuid the document UUID/*from ww w. j a va 2 s . c o m*/ * @return the ACL values (can be null) * @throws CatalogIndexException if an exception occurs */ @Override public String[] queryAcls(String uuid) throws CatalogIndexException { ArrayList<String> values = new ArrayList<String>(); IndexSearcher searcher = null; TermDocs termDocs = null; try { uuid = Val.chkStr(uuid); if (uuid.length() > 0) { searcher = newSearcher(); String[] aFields = new String[] { Storeables.FIELD_ACL }; MapFieldSelector selector = new MapFieldSelector(aFields); searcher = newSearcher(); IndexReader reader = searcher.getIndexReader(); termDocs = reader.termDocs(); termDocs.seek(new Term(Storeables.FIELD_UUID, uuid)); if (termDocs.next()) { Document document = reader.document(termDocs.doc(), selector); Field[] fields = document.getFields(Storeables.FIELD_ACL); if ((fields != null) && (fields.length > 0)) { for (Field field : fields) { values.add(field.stringValue()); } } } } } catch (IOException e) { String sMsg = "Error accessing index:\n " + Val.chkStr(e.getMessage()); throw new CatalogIndexException(sMsg, e); } finally { try { if (termDocs != null) termDocs.close(); } catch (Exception ef) { } closeSearcher(searcher); } return values.toArray(new String[0]); }
From source file:com.esri.gpt.catalog.lucene.LuceneIndexSynchronizer.java
License:Apache License
/**
 * Walks the documents within the database modifying the index as required.
 *
 * <p>For every database record: non-indexable documents are queued for
 * deletion from the index; indexable documents are compared against the
 * indexed copy (modification date, ACLs, collection membership) and
 * re-published when out of sync.</p>
 *
 * @param info synchronization step information
 * @throws IOException if an exception occurs while communicating with the index
 * @throws SQLException if an exception occurs while communicating with the database
 * @throws CatalogIndexException if an exception occurs while modifying the index
 */
private void walkDatabase(WalkDatabaseInfo info) throws IOException, SQLException, CatalogIndexException {
    LOGGER.fine("Checking database records...");
    PreparedStatement st = null;
    PreparedStatement stCol = null;
    TermDocs termDocs = null;
    try {
        // determine the metadata acl policy
        MetadataAcl acl = new MetadataAcl(this.context);
        boolean bCheckAcl = !acl.isPolicyUnrestricted();
        // determine if collections are being used
        List<String[]> collections = null;
        CollectionDao colDao = new CollectionDao(this.context);
        boolean hasCollections = false;
        boolean useCollections = colDao.getUseCollections();
        String sColMemberTable = colDao.getCollectionMemberTableName();
        String sqlCol = "SELECT COLUUID FROM " + sColMemberTable + " WHERE DOCUUID=?";
        if (useCollections) {
            collections = colDao.queryCollections();
            hasCollections = (collections.size() > 0);
        }
        // initialize index related variables; only consult the index when it
        // originally contained documents
        boolean bCheckIndex = (info.numOriginallyIndexed > 0);
        String fldUuid = Storeables.FIELD_UUID;
        String fldModified = Storeables.FIELD_DATEMODIFIED;
        String fldAcl = Storeables.FIELD_ACL;
        ArrayList<String> alFields = new ArrayList<String>();
        alFields.add(fldModified);
        if (bCheckAcl)
            alFields.add(fldAcl);
        if (useCollections)
            alFields.add("isPartOf");
        FieldSelector selector = new MapFieldSelector(alFields.toArray(new String[0]));
        Term termUuid = new Term(fldUuid);
        if (bCheckIndex) {
            termDocs = this.reader.termDocs();
        }
        StringSet delUuids = new StringSet();
        // build the database query
        StringBuffer sb = new StringBuffer("SELECT");
        sb.append(" ").append(this.resourceTable).append(".DOCUUID");
        sb.append(",").append(this.resourceTable).append(".APPROVALSTATUS");
        sb.append(",").append(this.resourceTable).append(".PROTOCOL_TYPE");
        sb.append(",").append(this.resourceTable).append(".FINDABLE");
        sb.append(",").append(this.resourceTable).append(".UPDATEDATE");
        sb.append(",").append(this.resourceTable).append(".ACL");
        sb.append(" FROM ").append(this.resourceTable);
        String sql = sb.toString();
        LOGGER.finest(sql);
        // execute the query, walk through the database records
        // NOTE(review): the ResultSet 'rs' is never explicitly closed; it is
        // presumably released when 'st' is closed in the finally block — verify.
        Connection con = this.context.getConnectionBroker().returnConnection("").getJdbcConnection();
        st = con.prepareStatement(sql);
        ResultSet rs = st.executeQuery();
        if (this.checkInterrupted())
            return;
        if (useCollections && hasCollections) {
            stCol = con.prepareStatement(sqlCol);
        }
        while (rs.next()) {
            info.numProcessed++;
            info.loopCount++;
            long nDbTimeModified = 0;
            Timestamp tsDbModified = null;
            String sDbAcl = null;
            boolean bIndexable = false;
            // read the database uuid and approval status
            String uuid = rs.getString(1);
            String status = rs.getString(2);
            String protocolType = Val.chkStr(rs.getString(3));
            boolean findable = Val.chkBool(rs.getString(4), false);
            bIndexable = (status != null)
                    && (status.equalsIgnoreCase("approved") || status.equalsIgnoreCase("reviewed"));
            // records with a protocol type must additionally be flagged findable
            if (bIndexable && protocolType.length() > 0 && !findable) {
                bIndexable = false;
            }
            // read the database modification date
            if (bIndexable) {
                tsDbModified = rs.getTimestamp(5);
                if (tsDbModified != null) {
                    nDbTimeModified = tsDbModified.getTime();
                }
                bIndexable = (nDbTimeModified > 0);
            }
            // for non-indexable documents, delete (buffered, flushed in batches)
            if (!bIndexable) {
                info.numNonIndexable++;
                if (bCheckIndex) {
                    termDocs.seek(termUuid.createTerm(uuid));
                    if (termDocs.next()) {
                        info.numNonIndexableFound++;
                        info.numRequiringDelete++;
                        delUuids.add(uuid);
                        // flush the delete buffer once it reaches the token limit
                        if (delUuids.size() >= this.maxDeleteTokens) {
                            if (this.checkInterrupted())
                                return;
                            this.deleteDocuments(delUuids);
                            info.numDocsDeleted += delUuids.size();
                            delUuids.clear();
                            if (this.checkInterrupted())
                                return;
                        }
                    }
                }
            }
            // for indexable documents, check to ensure that they are in sync
            if (bIndexable) {
                info.numIndexable++;
                boolean bRequiresUpdate = true;
                // find the document within the index
                if (bCheckIndex) {
                    termDocs.seek(termUuid.createTerm(uuid));
                    if (termDocs.next()) {
                        info.numIndexableFound++;
                        Document doc = this.reader.document(termDocs.doc(), selector);
                        if (doc != null) {
                            bRequiresUpdate = false;
                            // check the modification date
                            long nIdxTimeModified = 0;
                            String sModified = doc.get(fldModified);
                            if (sModified != null) {
                                try {
                                    nIdxTimeModified = Long.valueOf(sModified);
                                } catch (NumberFormatException e) {
                                    nIdxTimeModified = 0;
                                }
                            }
                            bRequiresUpdate = (nIdxTimeModified == 0) || (nDbTimeModified > nIdxTimeModified);
                            if (bRequiresUpdate)
                                info.numWithInconsistentDates++;
                            // check the acl (order-insensitive, case-insensitive comparison)
                            if (!bRequiresUpdate && bCheckAcl) {
                                long tAclStartMillis = System.currentTimeMillis();
                                bRequiresUpdate = true;
                                String[] aclsDb = null;
                                sDbAcl = rs.getString(6);
                                try {
                                    // use an internal method for quick parsing
                                    //aclsDb = acl.makeDocumentAcl(sDbAcl);
                                    aclsDb = this.parseAcl(sDbAcl);
                                } catch (Exception eacl) {
                                    String sMsg = "Error parsing acl";
                                    sMsg += ", uuid=" + uuid + "\n" + Val.chkStr(eacl.getMessage());
                                    LOGGER.log(Level.WARNING, sMsg, eacl);
                                }
                                if (aclsDb == null)
                                    aclsDb = new String[0];
                                ArrayList<String> aclsIdx = new ArrayList<String>();
                                Field[] aclFields = doc.getFields(fldAcl);
                                if ((aclFields != null) && (aclFields.length > 0)) {
                                    for (Field aclField : aclFields) {
                                        aclsIdx.add(aclField.stringValue());
                                    }
                                }
                                if (aclsDb.length == aclsIdx.size()) {
                                    int nMatched = 0;
                                    if (aclsDb.length > 0) {
                                        for (String s1 : aclsDb) {
                                            for (String s2 : aclsIdx) {
                                                if (s1.equalsIgnoreCase(s2)) {
                                                    nMatched++;
                                                    break;
                                                }
                                            }
                                        }
                                    }
                                    bRequiresUpdate = (nMatched != aclsDb.length);
                                }
                                if (bRequiresUpdate)
                                    info.numWithInconsistentAcls++;
                                info.aclMillis += (System.currentTimeMillis() - tAclStartMillis);
                            }
                            // check collection membership
                            if (!bRequiresUpdate && useCollections) {
                                long tColStartMillis = System.currentTimeMillis();
                                bRequiresUpdate = true;
                                ArrayList<String> colDb = new ArrayList<String>();
                                if (useCollections && hasCollections) {
                                    stCol.clearParameters();
                                    stCol.setString(1, uuid);
                                    ResultSet rsCol = stCol.executeQuery();
                                    while (rsCol.next()) {
                                        String sCUuid = rsCol.getString(1);
                                        // map collection uuid -> collection name
                                        for (String[] col : collections) {
                                            if (sCUuid.equals(col[0])) {
                                                colDb.add(col[1]);
                                                break;
                                            }
                                        }
                                    }
                                    rsCol.close();
                                }
                                ArrayList<String> colIdx = new ArrayList<String>();
                                Field[] colFields = doc.getFields("isPartOf");
                                if ((colFields != null) && (colFields.length > 0)) {
                                    for (Field colField : colFields) {
                                        colIdx.add(colField.stringValue());
                                    }
                                }
                                if (colDb.size() == colIdx.size()) {
                                    int nMatched = 0;
                                    if (colDb.size() > 0) {
                                        for (String s1 : colDb) {
                                            for (String s2 : colIdx) {
                                                if (s1.equalsIgnoreCase(s2)) {
                                                    nMatched++;
                                                    break;
                                                }
                                            }
                                        }
                                    }
                                    bRequiresUpdate = (nMatched != colDb.size());
                                }
                                if (bRequiresUpdate)
                                    info.numWithInconsistentColMembership++;
                                info.colMillis += (System.currentTimeMillis() - tColStartMillis);
                            }
                        }
                    }
                }
                // execute the update if required
                if (bRequiresUpdate) {
                    if (this.checkInterrupted())
                        return;
                    try {
                        if (bCheckAcl) {
                            if (sDbAcl == null)
                                sDbAcl = rs.getString(6);
                        }
                        String sXml = Val.chkStr(this.readXml(uuid));
                        if (sXml.length() > 0) {
                            info.numRequiringUpdate++;
                            MetadataDocument mdDoc = new MetadataDocument();
                            Schema schema = mdDoc.prepareForView(this.context, sXml);
                            this.adapter.publishDocument(uuid, tsDbModified, schema, sDbAcl);
                            info.numDocsUpdated++;
                        }
                    } catch (SchemaException se) {
                        // dont' allow the entire process to fail over one bad xml
                        String sMsg = "Error indexing document during synchronization";
                        sMsg += ", uuid=" + uuid + "\n" + Val.chkStr(se.getMessage());
                        LOGGER.log(Level.WARNING, sMsg, se);
                    }
                    if (this.checkInterrupted())
                        return;
                }
            }
            // cache the synchronized uuids (cache is abandoned once it exceeds the cap)
            if (this.synchedUuidCache != null) {
                this.synchedUuidCache.put(uuid, "");
                if (this.synchedUuidCache.size() > this.maxUuidCache) {
                    this.synchedUuidCache = null;
                }
            }
            // log a status message if the feedback threshold was reached
            if (this.checkInterrupted())
                return;
            if ((System.currentTimeMillis() - info.loopStartMillis) >= this.feedbackMillis) {
                LOGGER.info(info.getLoopMessage());
            }
        }
        // delete any documents left over in the buffer
        // NOTE(review): 'delUuids.size() >= 0' is always true, so this final
        // flush runs even with an empty buffer — likely intended '> 0'; verify
        // that deleteDocuments() tolerates an empty set before changing.
        if (delUuids.size() >= 0) {
            if (this.checkInterrupted())
                return;
            this.deleteDocuments(delUuids);
            info.numDocsDeleted += delUuids.size();
        }
        LOGGER.info(info.getStepMessage());
    } finally {
        // best-effort cleanup of the statements and the index term enumerator
        try {
            if (st != null)
                st.close();
        } catch (Exception ef) {
        }
        try {
            if (stCol != null)
                stCol.close();
        } catch (Exception ef) {
        }
        try {
            if (termDocs != null)
                termDocs.close();
        } catch (Exception ef) {
        }
    }
}