Java tutorial
// Copyright 2007 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.google.enterprise.connector.sharepoint.client; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import com.google.enterprise.connector.sharepoint.client.AlertsHelper; import com.google.enterprise.connector.sharepoint.client.SPConstants.FeedType; import com.google.enterprise.connector.sharepoint.client.SPConstants.SPType; import com.google.enterprise.connector.sharepoint.client.UserProfile2003Helper; import com.google.enterprise.connector.sharepoint.client.UserProfile2007Helper; import com.google.enterprise.connector.sharepoint.spiimpl.SPDocument; import com.google.enterprise.connector.sharepoint.spiimpl.SPDocumentList; import com.google.enterprise.connector.sharepoint.spiimpl.SharepointException; import com.google.enterprise.connector.sharepoint.state.GlobalState; import com.google.enterprise.connector.sharepoint.state.ListState; import com.google.enterprise.connector.sharepoint.state.WebState; import com.google.enterprise.connector.sharepoint.wsclient.client.ClientFactory; import com.google.enterprise.connector.sharepoint.wsclient.client.ListsWS; import com.google.enterprise.connector.spi.SpiConstants.ActionType; import org.apache.axis.utils.XMLUtils; import java.util.ArrayList; import java.util.Calendar; import java.util.Collections; import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.logging.Level; import java.util.logging.Logger; /** * This class provides a layer of abstraction between the SharePoint Traversal * Manager and the java clients for making web service calls. Every time * traversal is started/resumed, connector goes through this layer. This class * has the inteliigence to know which web service should be consulted for some * purpose. This class has all the methods needed to get documents and sites * from the sharepoint server. */ public class SharepointClient { private static final Logger LOGGER = Logger.getLogger(SharepointClient.class.getName()); private final SharepointClientContext sharepointClientContext; private final ClientFactory clientFactory; private int nDocuments = 0; // true -> when threshold is not reached and all webs // all lists all documents are done. // false -> when a partial cycle is completed i.e, threshold is // reached before processing all the documents. private boolean doCrawl; // This is mainly for test cases. It gives the count of liststates that are // checked for any docs pending from previous crawl cycle private int noOfVisitedListStates = 0; public SharepointClient(final ClientFactory clientFactory, final SharepointClientContext inSharepointClientContext) throws SharepointException { this.clientFactory = clientFactory; sharepointClientContext = inSharepointClientContext; // Register a SAX client factory with Axis so that we can intercept SAX // parsing failures. This is needed to ignore some SAX parsing failures // such as duplicate attributes defined in the metadata of a document. XMLUtils.initSAXFactory("com.google.enterprise.connector.sharepoint.wsclient.handlers.SaxErrorFactory", true, false); } /** * For a single ListState, handle its crawl queue (if any). This means add it * to the ResultSet which we give back to the Connector Manager. * * @param globalState The recent snapshot of the whole in-memory state file. * @param web Represets the current web state * @param list Represents the current list state * @return {@link SPDocumentList} conatining the crawled documents. */ @VisibleForTesting SPDocumentList handleCrawlQueueForList(final GlobalState globalState, final WebState web, final ListState list) { if (null == web) { LOGGER.log(Level.WARNING, "web is not found"); return null; } if (null == list) { LOGGER.log(Level.WARNING, "list is not found"); return null; } final List<SPDocument> crawlQueue = list.getCrawlQueue(); if (null == crawlQueue || crawlQueue.size() <= 0) { LOGGER.log(Level.FINE, "No CrawlQueue.."); return null; } ImmutableList.Builder<SPDocument> newListBuilder = new ImmutableList.Builder<SPDocument>(); for (SPDocument doc : list.getCrawlQueue()) { ListState parentList = doc.getParentList(); if (parentList == null) { LOGGER.log(Level.WARNING, "Document [{0}] is missing parent list. " + "Assigning [{1}] as parent list for document.", new Object[] { doc.getUrl(), list.getListURL() }); doc.setParentList(list); } else { if (!list.getPrimaryKey().equals(parentList.getPrimaryKey())) { LOGGER.log(Level.WARNING, "Skipping document . Parent List - crawl queue mismatch" + " for document [{0}]. Parent List is [{1}]. " + "Crawl Queue is associated with list is [{2}].", new Object[] { doc, parentList, list }); continue; } } doc.setParentWeb(web); doc.setSharepointClientContext(sharepointClientContext); // Update necessary information required for downloading contents. if (FeedType.CONTENT_FEED == doc.getFeedType()) { doc.setContentDwnldURL(doc.getUrl()); } newListBuilder.add(doc); LOGGER.log(Level.FINEST, "[ DocId = " + doc.getDocId() + ", URL = " + doc.getUrl() + " ]"); } ImmutableList<SPDocument> newlist = newListBuilder.build(); if (newlist.isEmpty()) { // If all documents are skipped because of possible // crawl queue mismatch, then clear crawl queue for list. list.setCrawlQueue(null); return null; } // Update crawl queue for list with filtered documents. list.setCrawlQueue(newlist); final SPDocumentList docList = new SPDocumentList(newlist, globalState); // FIXME These could be set in traversal manager just before returning // start/resumeTraversal if (null != sharepointClientContext) { // FIXME These could be set in traversal manager just before // returning // start/resumeTraversal docList.setAliasMap(sharepointClientContext.getAliasMap()); docList.setFQDNConversion(sharepointClientContext.isFQDNConversion()); docList.setReWriteDisplayUrlUsingAliasMappingRules( sharepointClientContext.isReWriteDisplayUrlUsingAliasMappingRules()); docList.setReWriteRecordUrlUsingAliasMappingRules( sharepointClientContext.isReWriteRecordUrlUsingAliasMappingRules()); } else { LOGGER.log(Level.SEVERE, "sharepointClientContext not found!"); } return docList; } /** * Scans the crawl queue of all the ListStates from a given WebState and * constructs a {@link SPDocumentList} object to be returned to CM. * {@link WebState#getCurrentListstateIterator()} takes care of the fact that * same list is not scanned twice in case the traversal has been resumed. * <p> * At the end, fetches the ACL of all the documents contained in the * {@link SPDocumentList} object. Ensures that ACL are not re-fetched when * documents from previous batch traversal are being returned. * <p> * <b>No documents are returned in case there are failures/errors while * retrieving ACLs</b> * <p> * Logs the {@link OutOfMemoryError} when fetching ACLs. For retry, need to * edit properties in connectorInstance.xml and restart * <ul> * <li>If 'fetchACLInBatches' is enabled, tries to fetch ACLs in smaller * batches of (n/aclBatchSizeFactor) (n being the number of documents).</li> * <li>Both 'fetchACLInBatches' and 'aclBatchSizeFactor' can be edited from * connectorInstance.xml</li> * </ul> * * @param globalState The {@link GlobalState} representing all the SharePoint * sites. Primary required when constructing the * {@link SPDocumentList} * @param webState The {@link WebState} whose lists ned to be scanned for * documents * @param sizeSoFar This indicates the number documents that have been * previously fetched and added to the global crawl queue. This is * useful in cases when a single list/site does not have sufficient * documents that can match the batchHint and hence multiple * site/lists need to be scanned. * @param sendPendingDocs True will indicate that documents retrieved as part * of previous batch traversal need to be sent. This will be the case * when connector returned batch Hint or little more docs, but the CM * did not feed all of them to GSA and checkPoint() was called, * implying there are docs from previous batch traversal to be sent. * In such a case, ACLs should not be re-fetched * @return {@link SPDocumentList} containing crawled {@link SPDocument}. */ public SPDocumentList traverse(final GlobalState globalState, final WebState webState, int sizeSoFar, boolean sendPendingDocs) { if (webState == null) { LOGGER.warning("global state is null"); return null; } noOfVisitedListStates = 0; SPDocumentList resultSet = null; Iterator<ListState> iter = sendPendingDocs ? webState.getIterator() : webState.getCurrentListstateIterator(); while (iter.hasNext()) { final ListState list = iter.next(); if (list.isSiteDefaultPage()) { continue; } // Mark this list as current list so that the next traversal // request starts from here and already scanned lists are not // unnecessarily re-scanned. webState.setCurrentList(list); if (list.getCrawlQueue() == null) { continue; } SPDocumentList resultsList = null; try { LOGGER.log(Level.FINE, "Handling crawl queue for list URL [ " + list.getListURL() + " ]. "); resultsList = handleCrawlQueueForList(globalState, webState, list); noOfVisitedListStates++; } catch (final Exception e) { LOGGER.log(Level.WARNING, "Problem in handling crawl queue for list URL [ " + list.getListURL() + " ]. ", e); } if ((resultsList != null) && (resultsList.size() > 0)) { LOGGER.log(Level.INFO, resultsList.size() + " document(s) to be sent from list URL [ " + list.getListURL() + " ]. "); if (resultSet == null) { resultSet = resultsList; } else { resultSet.addAll(resultsList); } } else { LOGGER.log(Level.FINE, "No documents to be sent from list URL [ " + list.getListURL() + " ]. "); } if (resultsList != null) { sizeSoFar += resultsList.size(); } // Check if the docs added so far meet the batchHint if (sizeSoFar >= sharepointClientContext.getBatchHint()) { LOGGER.info("Stopping traversal because batch hint " + sharepointClientContext.getBatchHint() + " has been reached. Processed documents: " + sizeSoFar); break; } } ListState listForWeb = webState.lookupList(webState.getPrimaryKey()); if (listForWeb != null) { SPDocumentList resultsList = handleCrawlQueueForList(globalState, webState, listForWeb); if (resultsList != null) { if (resultSet != null) { resultSet.addAll(resultsList); } else { resultSet = resultsList; } } } // Fetch ACL for all the documents crawled from the current WebState if (!handleACLForDocuments(resultSet, webState, globalState, sendPendingDocs)) { return null; } LOGGER.config(noOfVisitedListStates + " lists scanned from site " + webState.getWebUrl() + ". found " + resultSet + " docs"); return resultSet; } /** * If the connector is set to push ACL, fetches the ACL. Takes care to * consider that ACL is not retrieved more than once esp. for when documents * are pending from previous batch traversals * * @param resultSet The list of documents discovered in current/previous batch * traversals * @param webState The web state representing the site * @param globalState The global state representing the list of all sites and * their information * @param sendPendingDocs True if the documents were discovered in previous * batch traversal but fed in the current traversal OR false * otherwise * @return True if ACL was retrieved successfully OR false in case of any * exceptions/errors */ @VisibleForTesting boolean handleACLForDocuments(SPDocumentList resultSet, WebState webState, GlobalState globalState, boolean sendPendingDocs) { if (!sharepointClientContext.isPushAcls()) { // When the connector is not set to feed ACLs no further checks are // required, just return true to send docs to CM and GSA return true; } if (resultSet == null || resultSet.size() == 0) { return true; } if (sendPendingDocs) { boolean missingAcls = false; for (SPDocument document : resultSet.getDocuments()) { missingAcls = document.isMissingAcls(); if (missingAcls) { LOGGER.log(Level.WARNING, "Document [{0}] is missing ACL. This is an overflow document " + "from WebState [{1}]. Fetching ACLs for this batch.", new Object[] { document.getUrl(), webState.getWebUrl() }); break; } } if (!missingAcls) { // This is to indicate that ACLs have been retrieved previously and // hence just return the set of docs return true; } } boolean aclRetrievalResult; // Fetch ACL for all the documents crawled from the current WebState // Do not try to re-fetch the ACL when documents are pending from // previous batch traversals int aclBatchSize = sharepointClientContext.getAclBatchSize(); if (aclBatchSize <= 0) { aclRetrievalResult = fetchACLForDocuments(resultSet, webState, globalState); } else { aclRetrievalResult = fetchACLInBatches(resultSet, webState, globalState, aclBatchSize); } // Resolve SP Groups only if ACLs retrieval is successful if (aclRetrievalResult) { return resolveSharePointGroups(webState); } else { LOGGER.log(Level.WARNING, "No documents will be sent for site [ " + webState.getWebUrl() + " ] as ACL retrieval has failed. Please check the errors/logs" + " associated with ACL retrieval before this"); return false; } } /** * Resolves SharePoint Groups for WebState * @param webState for which SharePoint Groups needs to be resolved * @return boolean flag indicating if SharePoint Group Resolution for * WebState is successful. True = Success. False = Failure */ private boolean resolveSharePointGroups(WebState webState) { if (webState.getSPGroupsToResolve() == null || webState.getSPGroupsToResolve().isEmpty()) { return true; } LOGGER.log(Level.INFO, "Resolving SharePoint Groups for [" + webState.getWebUrl() + "]"); try { AclHelper aclHelper = new AclHelper(sharepointClientContext, webState.getWebUrl()); return aclHelper.resolveSharePointGroups(webState); } catch (Exception ex) { // Return false indicating that SharePoint Group Resolution is failed. LOGGER.log(Level.WARNING, "Problem while resolving groups under WebState [ " + webState.getWebUrl() + " ].", ex); return false; } } /** * Fetches the ACL for documents. * <p> * Based on the size of ACL per document, the WS response can be large and * result in {@link java.lang.OutOfMemoryError}. In such a case, the connector * will log the error * </p> * * @param resultSet The list of documents for which ACL should be fetched. * @param webState The web state representing the site * @param globalState The global state representing the list of all sites and * their information * @return True if ACL was retrieved successfully OR false in case of any * exceptions/errors */ private boolean fetchACLForDocuments(SPDocumentList resultSet, WebState webState, GlobalState globalState) { if (resultSet.size() <= 0) { LOGGER.log(Level.CONFIG, "Result set is empty. No documents to fetch ACL"); return false; } LOGGER.log(Level.INFO, "Fetching ACLs for #" + resultSet.size() + " documents crawled from web " + webState.getWebUrl()); try { AclHelper aclHelper = new AclHelper(sharepointClientContext, webState.getWebUrl()); aclHelper.fetchAclForDocuments(resultSet, webState); } catch (Throwable t) { logError(resultSet, webState, t); // Return false indicating that the ACL retrieval for current batch // has failed and skipped return false; } // Return true indicating successful retrieval of ACL return true; } /** * Common method to log ACL retrieval errors * * @param resultSet The document list for which ACL retrieval was attempted * @param te The error/exception encountered */ private void logError(SPDocumentList resultSet, WebState webState, Throwable te) { // Check for OOM and indicate that connector service needs to be // restarted if (te instanceof OutOfMemoryError) { LOGGER.log(Level.SEVERE, "Connector encountered fatal error : \"OutOfMemoryError\" which might be due to a large web service response while fetching ACL for " + resultSet.size() + " documents for documents crawled under WebState [ " + webState.getWebUrl() + " ]. Please enable 'fetchACLInBatches' flag and modify the 'aclBatchSizeFactor' in connectorInstance.xml and restart the connector service", te); } else { LOGGER.log(Level.WARNING, "Problem while fetching ACLs for documents crawled under WebState [ " + webState.getWebUrl() + " ] . ", te); } LOGGER.warning("Skipping ACL retrieval for the document list : " + resultSet.toString()); } /** * Fetches ACL for documents in batches. Required to handle the * {@link OutOfMemoryError} kind errors * <ul> * <li>When re-fetching ACLs, tries to fetch in smaller batches of * n/batchSizeFactor (n being he number of documents).</li> * </ul> * * @param resultSet The set of documents whose ACL needs to be re-fetched in * smaller batches * @param webState The {@link WebState} to which the documents belong * @param globalState The {@link GlobalState} required primarily for the * {@link SPDocumentList} * @param batchSize Batch size to be used for fetching ACLs in batches * @return True if ACLs were retrieved successfully OR false in case of any * exceptions/errors */ /* * The access method is package level for JUnit test cases */ boolean fetchACLInBatches(SPDocumentList resultSet, WebState webState, GlobalState globalState, int batchSize) { if (resultSet.size() <= 0) { LOGGER.log(Level.CONFIG, "Result set is empty. No documents to fetch ACL"); return false; } LOGGER.info("The connector will attempt to fetch ACLs for documents in batches of " + batchSize); int toIndex = 0; for (int i = 0; i < resultSet.size(); i += batchSize) { // Use the batchSize to identify the subset of docs. The toIndex // indicates the end of sub-set with 'i' indicating the start. toIndex += batchSize; if (toIndex > resultSet.size()) { toIndex = resultSet.size(); // In case the start and end index is same it will result in an // empty list. So ignore and proceed to next level if (i == toIndex) { LOGGER.log(Level.WARNING, "The start and end index of the List of the documents should not be same"); continue; } } SPDocumentList docList = new SPDocumentList(resultSet.getDocuments().subList(i, toIndex), globalState); // Fetch ACL if (!fetchACLForDocuments(docList, webState, globalState)) { // Return false indicating ACL retrieval has failed and the // entire batch of documents need to be skipped return false; } } return true; } /** * Discover extra webs viz, MySites, Personal Sites, GSSiteDiscover discovered * sites etc and store them into allSites. * * @param allSites * @param spType * @throws SharepointException */ private void discoverExtraWebs(final Set<String> allSites, final SPType spType) throws SharepointException { // TODO: Move this to the client factory. if (SPType.SP2003 == spType) { LOGGER.log(Level.INFO, "Getting the initial list of MySites/Personal " + "sites for SharePoint type SP2003. Context URL [ " + sharepointClientContext.getSiteURL() + " ]"); final UserProfile2003Helper userProfile = new UserProfile2003Helper(sharepointClientContext); if (userProfile.isSPS()) {// Check if SPS2003 or WSS 2.0 try { final Set<String> personalSites = userProfile.getPersonalSiteList(); allSites.addAll(personalSites); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Unable to get MySites for the Context URL [ " + sharepointClientContext.getSiteURL() + " ]", e); } } } else if (SPType.SP2007 == spType) { final String strMySiteURL = sharepointClientContext.getMySiteBaseURL(); if ((strMySiteURL != null) && (!strMySiteURL.trim().equals(""))) { LOGGER.log(Level.INFO, "Getting the initial list of MySites for SharePoint type SP2007 from MySiteBaseURL [ " + strMySiteURL + " ]"); final UserProfile2007Helper userProfile = new UserProfile2007Helper(sharepointClientContext); if (userProfile.isSPS()) { try { final Set<String> lstMyLinks = userProfile.getMyLinks(); allSites.addAll(lstMyLinks);// remove duplicates } catch (final Exception e) { LOGGER.log(Level.WARNING, "Unable to get MySites from MySiteBaseURL [ " + strMySiteURL + " ]", e); } try { final Set<String> personalSites = userProfile.getPersonalSiteList(); allSites.addAll(personalSites); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Unable to get Personal Sites for Context URL [ " + sharepointClientContext.getSiteURL() + " ]", e); } } } // Get all top level sites from the farm. Supported only in SP2007. final SiteDiscoveryHelper siteDiscovery = new SiteDiscoveryHelper(sharepointClientContext, null); final Set<String> sitecollection = siteDiscovery.getMatchingSiteCollections(); allSites.addAll(sitecollection); } } /** * iterate through fresh list of webs in allSites and update GS (i.e. add WS * if not there already) * * @param globalState * @param allSites * @return a set of all new webs that have been added to the globalstate */ private Set<WebState> updateGlobalState(final GlobalState globalState, final Set<String> allSites) { Set<WebState> newWebs = new HashSet<WebState>(); if ((null == allSites) || (allSites.size() == 0)) { return newWebs; } for (String url : allSites) { final WebState webState = updateGlobalState(globalState, url); if (null != webState) { newWebs.add(webState); } } return newWebs; } /** * Check for a web if it exists in the global state. If not, then creates a * corresponding web state and adds it into the global state. * * @param globalState * @param url * @return {@link WebState} null if the webstate was already existing in the * globalstate. Otherwise a valid reference to the newly created * WebState */ private WebState updateGlobalState(final GlobalState globalState, final String url) { WebState web = null; if (null == url) { LOGGER.log(Level.WARNING, "url not found!"); return web; } String webUrl = url; WebState wsGS = globalState.lookupWeb(url, null); /* * The incoming url might not always be exactly the web URL that is used * while creation of web state and is required by Web Services as such. * Hence, a second check is required. */ if (null == wsGS) { final String webAppURL = Util.getWebApp(url); WebsHelper webs = null; try { sharepointClientContext.setSiteURL(webAppURL); webs = new WebsHelper(sharepointClientContext); } catch (final Exception e) { LOGGER.log(Level.WARNING, "WebsHelper creation failed for URL [ " + url + " ]. ", e); } if (null != webs) { webUrl = webs.getWebURLFromPageURL(url); if (!url.equals(webUrl)) { wsGS = globalState.lookupWeb(webUrl, null); } } } if (null == wsGS) {// new web LOGGER.config("Making WebState for : " + webUrl); try { int responseCode = sharepointClientContext .checkConnectivity(Util.encodeURL(webUrl) + SPConstants.LISTS_END_POINT, null); if (responseCode != 400 && responseCode != 404) { web = globalState.makeWebState(sharepointClientContext, webUrl); } else { LOGGER.warning("Unable to connect to list web service for web. " + "Skipping WebState creation for URL [ " + webUrl + " ]."); sharepointClientContext.logExcludedURL("[ " + webUrl + " ] identified as invalid Web Url"); } } catch (final Exception e) { LOGGER.log(Level.WARNING, "Problem while creating web state for url [ " + webUrl + " ]. ", e); } } else { wsGS.setExisting(true); } return web; } /** * Discovers the child sites, MySites, Personal Sites, Sites discovered by * GSSite discovery. State information is updated as and when the webs are * discovered. A further call to updateWebStateFromSite is made to discover * the lists/libraries and the documents from each discovered web. * * @param globalState The recent state information */ // FIXME SharePointClientContext should not be passed as an argument in the // methods that are called from here. Instead, use the class member. public void updateGlobalState(final GlobalState globalState) throws SharepointException { if (globalState == null) { LOGGER.warning("global state does not exist"); return; } if (sharepointClientContext == null) { LOGGER.warning("sharepointClientContext is not found"); return; } SharepointClientContext tempCtx = (SharepointClientContext) sharepointClientContext.clone(); SiteDiscoveryHelper webCrawlInfoFetcher = null; if (sharepointClientContext.isUseSPSearchVisibility()) { webCrawlInfoFetcher = new SiteDiscoveryHelper(tempCtx, null); } // At the start of a new traversal cycle, we update the WebCrawlInfo of // all the webs if (globalState.isBFullReCrawl() && null != webCrawlInfoFetcher) { webCrawlInfoFetcher.updateWebCrawlInfoInBatch(globalState.getAllWebStateSet()); } nDocuments = 0; doCrawl = true; ListState nextList = globalState.getLastCrawledList(); WebState nextWeb = globalState.getLastCrawledWeb(); if (null == nextWeb) { nextWeb = globalState.lookupWeb(sharepointClientContext.getSiteURL(), sharepointClientContext); } else { sharepointClientContext.setSiteURL(nextWeb.getWebUrl()); } // start and end recrawl is used for detecting non-existent webs/lists globalState.startRecrawl(); if (null == nextWeb) { nextWeb = updateGlobalState(globalState, sharepointClientContext.getSiteURL()); if (null == nextWeb) { throw new SharepointException("Starting WebState for the current traversal can not be determined."); } if (null != webCrawlInfoFetcher) { nextWeb.setWebCrawlInfo(webCrawlInfoFetcher.getCurrentWebCrawlInfo()); } } LOGGER.info("Starting traversal from site [ " + nextWeb + " ]. "); SPType spType = nextWeb.getSharePointType(); // To store the intermediate webs discovered during crawl Set<String> allSites = new TreeSet<String>(); ArrayList<String> lstLookupForWebs = new ArrayList<String>(); // Traverse sites and lists from the last crawled site and list to fetch // batch hint # of docs nextWeb = traverseSites(globalState, allSites, tempCtx, nextWeb, nextList, lstLookupForWebs); // This will contain all the newly discovered webs and is used to // identify those webs which should be queried for their search // visibility options set on SharePoint. Set<WebState> newWebs = new HashSet<WebState>(); // Update all the web info into the globalstate. The newly discovered // webs, if any, will be processed in the same batch traversal in case // the batch hint # of documents have not been discovered newWebs.addAll(updateGlobalState(globalState, allSites)); // Cases being handled here: // 1. Batch hint # of documents have not been discovered, but there are // new sites which have been discovered. Crawl documents till you get // the batch hint # of docs // 2. Batch hint # of documents have not been discovered and no new // sites have been discovered. In such cases get any new // personal/mysites, sites discovered by GSS. Add them to the global // state and crawl them till batch hint # of documents is reached. if (doCrawl && spType != null) { // If the first check has passed, it might mean Case 1. If the // following if block is skipped, it means this is Case 1, else it // will be Case 2 if (newWebs.size() == 0) { // If this check passed, it means Case 2 if (LOGGER.isLoggable(Level.CONFIG)) { LOGGER.log(Level.CONFIG, "Discovering new sites"); } // Empty the current set of sites that have been traversed // before discovering the new ones. This is important in case // the current batch traversal has not discovered batch-hint no. // of docs. In such cases the connector should not traverse the // sites already traversed in the same batch traversal. allSites.clear(); // Initiate the discovery of new sites discoverExtraWebs(allSites, spType); newWebs.addAll(updateGlobalState(globalState, allSites)); } // The following does not care if the sites are discovered for Case // 1 or Case 2. It will simply go ahead and crawl batch hint no. of // docs from the new sites if (newWebs.size() > 0) { LOGGER.log(Level.INFO, "global state has been updated with #" + newWebs.size() + " newly discovered sites. About to traverse them for docs"); if (null != webCrawlInfoFetcher) { webCrawlInfoFetcher.updateWebCrawlInfoInBatch(newWebs); } // Traverse sites and lists under them to fetch batch hint # of // docs traverseSites(globalState, allSites, tempCtx, nextWeb, nextList, lstLookupForWebs); newWebs.clear(); // There are chances that new sites are discovered (child sites // OR linked sites) during the traversal of sites discovered as // linked sites themselves OR as child sites OR through GSS. In // such cases, the connector should just create webstates and // add them to the global state. The next batch traversal will // take them up for traversal newWebs.addAll(updateGlobalState(globalState, allSites)); if (newWebs.size() > 0) { if (null != webCrawlInfoFetcher) { webCrawlInfoFetcher.updateWebCrawlInfoInBatch(newWebs); } doCrawl = false; } } } else if (newWebs.size() > 0 && null != webCrawlInfoFetcher) { // This is the case when we have reached the batch-hint while // crawling the first web itself and hence no further discovery // has been done. At this point, we must update the WebcrawlInfo of // all the child/linked sites that might have been discovered as // part of the site's crawling. If we do not do this here, these // webs will become known webs in the next batch traversal and we do // not query WebCrawlInfo of known webs in between a traversal // cycle. webCrawlInfoFetcher.updateWebCrawlInfoInBatch(newWebs); } globalState.setBFullReCrawl(doCrawl); globalState.endRecrawl(sharepointClientContext); if (null != sharepointClientContext.getUserDataStoreDAO() && sharepointClientContext.getUserDataStoreDAO().getUdsCacheSize() > 0) { sharepointClientContext.getUserDataStoreDAO().cleanupCache(); } LOGGER.log(Level.INFO, "Returning after crawl cycle.. "); } public boolean isDoCrawl() { return doCrawl; } /** * Makes a call to WSClient layer to get the alerts for a site and updates the * global state. Alerts, in SharePoint are created at web level. Though, in * the state file that connector maintains a SPDoc can only be inside a * ListState. Hence, we need to create a dummy list here. ListID = * siteName_Alerts: to make it unique for alerts and LastMod: current time * * @param webState * @param tempCtx */ private void processAlerts(final WebState webState, final SharepointClientContext tempCtx) { if (null == webState) { return; } String internalName = webState.getPrimaryKey(); if (!internalName.endsWith("/")) { internalName += "/"; } internalName += "_" + SPConstants.ALERTS_TYPE; final Calendar cLastMod = Calendar.getInstance(); cLastMod.setTime(new Date()); ListState currentDummyAlertList = null; try { currentDummyAlertList = new ListState(internalName, SPConstants.ALERTS_TYPE, SPConstants.ALERTS_TYPE, cLastMod, SPConstants.ALERTS_TYPE, internalName, webState); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Unable to create the dummy list state for alerts. ", e); return; } if (currentDummyAlertList == null) { LOGGER.log(Level.WARNING, "Unable to create the dummy list state for alerts."); return; } // find the list in the Web state ListState dummyAlertListState = webState.lookupList(currentDummyAlertList.getPrimaryKey()); if (dummyAlertListState == null) { dummyAlertListState = currentDummyAlertList; } LOGGER.log(Level.INFO, "Getting alerts. internalName [ " + internalName + " ] "); List<SPDocument> listCollectionAlerts = null; try { final AlertsHelper alerts = new AlertsHelper(tempCtx); listCollectionAlerts = alerts.getAlerts(webState, dummyAlertListState); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Problem while getting alerts. ", e); } if (dummyAlertListState.isExisting()) { webState.AddOrUpdateListStateInWebState(dummyAlertListState, currentDummyAlertList.getLastMod()); dummyAlertListState.setCrawlQueue(listCollectionAlerts); if (listCollectionAlerts != null) { nDocuments += listCollectionAlerts.size(); } } } /** * Gets all the docs from the SPDocument Library and all the items and their * attachments from Generic Lists and Issues in sharepoint under a given site. * It first calls SiteData web service to get all the Lists. And then calls * Lists web service to get the list items for the lists which are of the type * SPDocument Library, Generic Lists or Issues. For attachments in Generic * List items and Issues, it calls Lists web service to get attachments for * these list items. * * @param tempCtx Current connector context * @param webState The state information of the web which is to be crawled for * documents * @param nextList Last List traversed. If the current web contains this list, * the traversal will start from here. * @param allWebs Contains all the webs that has been discovered from link * sites/Site directory. */ private void updateWebStateFromSite(final SharepointClientContext tempCtx, final WebState webState, ListState nextList, final Set<String> allWebs) throws SharepointException { List<SPDocument> listItems = new ArrayList<SPDocument>(); // get all the lists for the given web // e.g. picture,wiki,document // libraries etc. final SiteDataHelper siteData = new SiteDataHelper(tempCtx); List<ListState> listCollection = siteData.getNamedLists(webState); // Remove duplicate lists, if any. // TODO: We do not need to do this. Web Service does not return // duplicate lists. listCollection = new ArrayList<ListState>(new TreeSet<ListState>(listCollection)); try { SiteDiscoveryHelper gssd = new SiteDiscoveryHelper(tempCtx, webState.getWebUrl()); gssd.updateListCrawlInfo(listCollection); } catch (Exception e) { LOGGER.log(Level.WARNING, "Exception occurred when trying to to update the ListCrawlInfo for web [ " + webState.getWebUrl() + " ] ", e); } // Updating the latest metadata info for all list states. We may do this // updation when the crawl will begin; that will save this extra // iteration over the ListStates. But, there is one metadata which // must be updated before the change (ACL) detection and crawl begins. // That metadata is ListState.InheritiedSecurity flag which is very // important while processing ACL related changes. // TODO: with some re-structuring of code, we can still avoid this extra // iteration. for (ListState currentListState : listCollection) { ListState listState = webState.lookupList(currentListState.getPrimaryKey()); if (null != listState) { if (!listState.getListURL().equalsIgnoreCase(currentListState.getListURL())) { tempCtx.logToFile(SPConstants.DEFAULT_VIEW_URL_CHANGE_LOG, listState.getListURL()); } listState.updateList(currentListState); } } /* * If the nextList belongs the current web and is still existing on the * SharePoint site, start traversing for this list onwards. */ if (null != nextList && nextList.getParentWebState().equals(webState) && listCollection.contains(nextList)) { Collections.rotate(listCollection, -(listCollection.indexOf(nextList))); } AclHelper aclHelper = new AclHelper(tempCtx, webState.getWebUrl()); try { aclHelper.fetchAclChangesSinceTokenAndUpdateState(webState); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Problem Interacting with Custom ACL WS. web site [ " + webState.getWebUrl() + " ]. ", e); } List<SPDocument> aclChangedItems = null; final ListsHelper listsHelper = new ListsHelper(tempCtx); for (int i = 0; i < listCollection.size(); i++) { final ListState currentList = listCollection.get(i); ListState listState = webState.lookupList(currentList.getPrimaryKey()); if (sharepointClientContext.isUseSPSearchVisibility()) { // If this list is marked for No Crawling, do not crawl this // list. // Please note that, if this list is already known to the // connector, it'll keep existing in the connector's state. This // implies that if a list is marked as NoCrawl list on // SharePoint in between the connector's traversal, crawling of // this list will be paused at whatever state it is in. As soon // as the NoCrawl flag on SharePoint is reverted, the crawling // will be resumed from the saved state. if (currentList.isNoCrawl()) { LOGGER.log(Level.WARNING, "Skipping List URL [ " + currentList.getListURL() + " ] while crawling because it has been marked for No Crawling on SharePoint. "); if (null == listState) { // Make this list known by keeping it in the state. But, // do not crawl webState.AddOrUpdateListStateInWebState(currentList, currentList.getLastMod()); } continue; } } /* * If we already knew about this list, then only fetch docs that have * changed since the last doc we processed. If it's a new list (e.g. the * first SharePoint traversal), we fetch everything. */ if (listState == null) { listState = currentList; listState.setNewList(true); webState.AddOrUpdateListStateInWebState(listState, listState.getLastMod()); LOGGER.info("discovered new listState. List URL: " + listState.getListURL()); if (SPType.SP2007 == webState.getSharePointType()) { if (FeedType.CONTENT_FEED == sharepointClientContext.getFeedType()) { // In case of content feed, we need to keep track of // folders and the items under that. This is required // for sending delete feeds for the documents when their // parent folder is deleted. LOGGER.log(Level.CONFIG, "Discovering all folders under current list/library [ " + listState.getListURL() + " ] "); try { listsHelper.getSubFoldersRecursively(listState, null, null); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Exception occured while getting the folders hierarchy for list [ " + listState.getListURL() + " ]. ", e); } catch (final Throwable t) { LOGGER.log(Level.WARNING, "Error occured while getting the folders hierarchy for list [ " + listState.getListURL() + " ]. ", t); } } try { listItems = listsHelper.getListItemChangesSinceToken(listState, allWebs); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Exception thrown while getting the documents under list [ " + listState.getListURL() + " ].", e); } catch (final Throwable t) { LOGGER.log(Level.WARNING, "Error thrown while getting the documents under list [ " + listState.getListURL() + " ].", t); } } else { try { listItems = listsHelper.getListItems(listState, null, null, allWebs); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Exception thrown while getting the documents under list [ " + listState.getListURL() + " ].", e); } } } else { LOGGER.info("revisiting listState [ " + listState.getListURL() + " ]. "); listState.setExisting(true); listState.setNextPage(null); String lastDocID = null; SPDocument lastDoc = listState.getLastDocForWSRefresh(); /* * We must ensure that the last doc that we are using was actually sent * as ADD feed and not as DELETE feed. It might be possible that in one * cycle we identify a list as non-existing and hence started sending * delete feeds for it. But, in the next cycle that list has been * restored, in that case we can not rely on the lastDoc which has been * set by a delete feed. We also need to reset the change token in that * case to start a full crawl. */ if (lastDoc != null) { if (FeedType.CONTENT_FEED == sharepointClientContext.getFeedType() && ActionType.DELETE.equals(lastDoc.getAction())) { listState.resetState(); if (FeedType.CONTENT_FEED == sharepointClientContext.getFeedType()) { // In case of content feed, we need to keep track of // folders and the items under that. This is // required for sending delete feeds for the // documents when their parent folder is deleted. LOGGER.log(Level.CONFIG, "Discovering all folders under current list/library [ " + listState.getListURL() + " ] "); try { listsHelper.getSubFoldersRecursively(listState, null, null); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Exception occured while getting the folders hierarchy for list [ " + listState.getListURL() + " ]. ", e); } catch (final Throwable t) { LOGGER.log(Level.WARNING, "Error occured while getting the folders hierarchy for list [ " + listState.getListURL() + " ]. ", t); } } LOGGER.info("recrawling the items under listState [ " + listState.getListURL() + " ] because this list has been restored after deletion."); } else { lastDocID = Util.getOriginalDocId(lastDoc.getDocId(), sharepointClientContext.getFeedType()); } } if (SPType.SP2007.equals(webState.getSharePointType())) { try { webState.AddOrUpdateListStateInWebState(listState, currentList.getLastMod()); // Any documents to be crawled because of ACL Changes aclChangedItems = aclHelper.getListItemsForAclChangeAndUpdateState(listState, listsHelper); if (null == aclChangedItems || aclChangedItems.size() < sharepointClientContext.getBatchHint()) { // Do regular incremental crawl listItems = listsHelper.getListItemChangesSinceToken(listState, allWebs); } } catch (final Exception e) { LOGGER.log(Level.WARNING, "Exception thrown while getting the documents under list [ " + listState.getListURL() + " ].", e); } catch (final Throwable t) { LOGGER.log(Level.WARNING, "Error thrown while getting the documents under list [ " + listState.getListURL() + " ].", t); } } else { try { final Calendar dateSince = listState.getDateForWSRefresh(); webState.AddOrUpdateListStateInWebState(listState, currentList.getLastMod()); LOGGER.info("fetching changes since " + Util.formatDate(dateSince) + " for list [ " + listState.getListURL() + " ]. "); // check if date modified for the document library final Calendar dateCurrent = listState.getLastModCal(); if (dateSince.before(dateCurrent)) { listState.setNewList(true); } listItems = listsHelper.getListItems(listState, dateSince, lastDocID, allWebs); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Exception thrown while getting the documents under list [ " + listState.getListURL() + " ].", e); } catch (final Throwable t) { LOGGER.log(Level.WARNING, "Error thrown while getting the documents under list [ " + listState.getListURL() + " ].", t); } } } // Get the attachments for each discovered items, if the list allows // attachments if (listState.canContainAttachments() && (listItems != null)) { final List<SPDocument> attachmentItems = new ArrayList<SPDocument>(); for (int j = 0; j < listItems.size(); j++) { final SPDocument doc = listItems.get(j); if (ActionType.ADD.equals(doc.getAction())) { final List<SPDocument> attachments = listsHelper.getAttachments(listState, doc); attachmentItems.addAll(attachments); } } listItems.addAll(attachmentItems); } if (listState.getNextPage() == null) { if (((listItems != null) && (listItems.size() > 0)) || (listState.isNewList())) { SPDocument listDoc = listState.getDocumentInstance(sharepointClientContext.getFeedType()); listItems.add(listDoc); listState.setNewList(false); } } else { // Send List home page as part of this batch to complete inheritance // chain for discovered child items for partially traversed List. if (listState.isNewList() && listItems != null && listItems.size() > 0 && sharepointClientContext.getTraversalContext().supportsInheritedAcls() && !Strings.isNullOrEmpty(listState.getListItemCollectionPositionNext())) { SPDocument listDoc = listState.getDocumentInstance(sharepointClientContext.getFeedType()); listItems.add(listDoc); } // If any of the list has not been traversed completely, doCrawl // must not be set true. doCrawl = false; } // Add aclChangedItems to the docs crawled under regular crawling. // This is the right place to do this because all the operations // pertaining to regular crawling have been made. But, the // batch-hint check is yet to be done if (null != aclChangedItems) { if (null != listItems) { listItems.addAll(aclChangedItems); } else { listItems = aclChangedItems; } } listState.setCrawlQueue(listItems); // Set the last crawled date time. This is informative value for the // user viewing the state file listState.setLastCrawledDateTime(Util.getCurrentTimestampString()); if (null == listItems || listItems.size() == 0) { LOGGER.log(Level.CONFIG, "No items found from list " + listState); } else { Collections.sort(listItems); LOGGER.log(Level.INFO, "found " + listItems.size() + " items from list " + listState); nDocuments += listItems.size(); final int batchHint = sharepointClientContext.getBatchHint(); // As per Issue 116 we need to stop at batchHint or a little // more if (nDocuments >= batchHint) { doCrawl = false; break; } } } // end:; for Lists // Set the last crawled date time. This is informative value for the // user viewing the state file webState.setLastCrawledDateTime(Util.getCurrentTimestampString()); // Mark the current list as null so that the next time crawl queues are // scanned, all the ListStates are traversed and no documents that have // just been discovered gets skipped. webState.setCurrentList(null); } /** * Traverses list of sites (webstates) which have not yet been crawled and * discovers new docs to be sent to GSA * * @param globalState The global state which has the list of sites (webstates) * that need to be crawled for documents * @param allSites The list of sites * @param sharePointClientContext The current connector context. Instance of * {@link SharepointClientContext} * @param nextWeb last site (webstate) that was crawled * @param nextList last liststate that as crawled * @param lstLookupForWebs webs which are already traversed and should not be * traversed again * @throws SharepointException In case of any problems fetching documents * @return Last Web crawled. This helps caller an idea about from where the * next crawl should begin. */ // TODO: Why do we pass SharePointClientContext object as argument here? // It's already available as a member of this class. Is there any // intentional differences between the states of these two // SharePointClientContexts? private WebState traverseSites(GlobalState globalState, Set<String> allSites, SharepointClientContext sharePointClientContext, WebState nextWeb, ListState nextList, ArrayList<String> lstLookupForWebs) throws SharepointException { globalState.setCurrentWeb(nextWeb); final Iterator<WebState> itWebs = globalState.getCircularIterator(); while (itWebs.hasNext()) { WebState ws = itWebs.next(); // Get the first web if (ws == null) { continue; } final String webURL = ws.getPrimaryKey(); // Note: Lookup table maintains keeps track of the links which has // been visited till now. // This helps to curb the cyclic link problem in which SiteA can // have link to SiteB and SiteB having link to SiteA. if (lstLookupForWebs.contains(webURL)) { continue; } else { lstLookupForWebs.add(webURL); } try { sharePointClientContext.setSiteURL(webURL); } catch (Exception e) { LOGGER.log(Level.WARNING, "Exception occurred when trying to set the webUrl [ " + webURL + " ] context", e); continue; } if (sharepointClientContext.isUseSPSearchVisibility()) { // Even if a web is not crawled due to the SP search visibility, // it's reference is kept in the connector's state. This is to // avoid unnecessary discovery (and WebState construction) of // these webs again and again. if (ws.isNoCrawl()) { LOGGER.log(Level.WARNING, "Skipping Web URL [ " + webURL + " ] while crawling because it has been marked for No Crawling on SharePoint. "); continue; } } nextWeb = ws; LOGGER.config("Crawling site [ " + webURL + " ] "); final int currDocCount = nDocuments; try { // Process the web site, and add the link site info to allSites. updateWebStateFromSite(sharePointClientContext, ws, nextList, allSites); if (currDocCount == nDocuments) { // get Alerts for the web and update webState. The above // check is added to reduce the frequency with which // getAlerts WS call is made. LOGGER.fine("Getting alerts under site [ " + webURL + " ]"); processAlerts(ws, sharePointClientContext); } ListState listForWeb = ws.lookupList(ws.getPrimaryKey()); if (listForWeb != null) { LOGGER.fine("List State for web [ " + listForWeb.getListURL() + " ] is not null. Last Doc from List State is " + listForWeb.getLastDocProcessed()); } boolean isFirstBatch = ((listForWeb == null) || (listForWeb.getLastDocProcessed() == null)); // Crawl the site home page and web application policy in the // first batch and when a web application policy change is detected. if (ws.isWebApplicationPolicyChange() || isFirstBatch) { // Get site data for the web and update webState. LOGGER.fine("Getting landing page data for the site [ " + webURL + " ]"); processSiteData(ws, sharepointClientContext); } } catch (final Exception e) { LOGGER.log(Level.WARNING, "Following exception occured while traversing/updating web state URL [ " + webURL + " ]. ", e); } catch (final Throwable t) { LOGGER.log(Level.WARNING, "Following error occured while traversing/updating web state URL [ " + webURL + " ]. ", t); } // Check if the threshold (i.e. batchHint is reached) final int batchHint = sharepointClientContext.getBatchHint(); // As per Issue 116 we need to stop at batchHint or a little more if (nDocuments >= batchHint) { LOGGER.info("Stopping crawl cycle as connector has discovered (>= batchHint) # of docs. In total : " + nDocuments + " docs. batch-hint is " + batchHint); doCrawl = false; break; } // Get the next web and discover its direct children sharepointClientContext.setSiteURL(webURL); WebsHelper webs = new WebsHelper(sharepointClientContext); try { final Set<String> allWebStateSet = webs.getDirectChildsites(); final int size = allWebStateSet.size(); if (size > 0) { LOGGER.log(Level.INFO, "Discovered " + size + " child sites under [ " + webURL + "]. "); } else { LOGGER.log(Level.CONFIG, "Discovered " + size + " child sites under [ " + webURL + "]. "); } allSites.addAll(allWebStateSet); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Unable to get the Child sites for site " + webURL, e); } } return nextWeb; } /** * Returns the no of visited list states to check for pending docs from * previous batch traversal for a given web state (site) * * @return The no of visited list states */ public int getNoOfVisitedListStates() { return noOfVisitedListStates; } /** * Makes a call to SiteData web service to get data for a site and update * global state. Site data in SharePoint is created at site level. Though, in * the state file that connector maintains a SPDocument can only be inside a * ListState. Hence we need to create a dummy list here. * * @param webState for which SPDcocument needs to be constructed. * @param tempCtx is the temporary SharepointClientContext object. */ private void processSiteData(final WebState webState, final SharepointClientContext tempCtx) { if (null == webState) { return; } final Calendar cLastMod = Calendar.getInstance(); cLastMod.setTime(new Date()); ListState currentDummySiteDataList = null; try { currentDummySiteDataList = new ListState(webState.getPrimaryKey(), webState.getTitle(), webState.getPrimaryKey(), cLastMod, SPConstants.SITE, webState.getPrimaryKey(), webState); } catch (final Exception e) { LOGGER.log(Level.WARNING, "Unable to create the dummy list state for site. " + webState.getWebUrl(), e); return; } // find the list in the Web state ListState dummySiteListState = webState.lookupList(currentDummySiteDataList.getPrimaryKey()); if (dummySiteListState == null) { dummySiteListState = currentDummySiteDataList; } LOGGER.log(Level.INFO, "Getting site data. internalName [ " + webState.getWebUrl() + " ] "); List<SPDocument> documentList = new ArrayList<SPDocument>(); SPDocument document = null; try { // SharePoint Client Context used to create SiteDataWS should point to // WebState URL. If not then SharePoint default page will point to // incorrect Web ID for Web State. SharepointClientContext ctxToPass = (SharepointClientContext) tempCtx.clone(); ctxToPass.setSiteURL(webState.getWebUrl()); final SiteDataHelper siteData = new SiteDataHelper(ctxToPass); // need to check whether the site exist or not and is not null if (webState.isExisting() && null != webState) { document = siteData.getSiteData(webState); document.setParentList(dummySiteListState); // Site Home Page document will be added as last doc from // dummy list state. This is required for sending delete feed. } } catch (final Exception e) { LOGGER.log(Level.WARNING, "Problem while getting site data. ", e); } // Web Application Policy Document processing. // Web Application Policy Document will be associated with each webstate. if (sharepointClientContext.isPushAcls()) { try { AclHelper aclHelper = new AclHelper(sharepointClientContext, webState.getWebUrl()); SPDocument webAppPolicy = aclHelper.getWebApplicationPolicy(webState, sharepointClientContext.getFeedType().toString()); if (webAppPolicy != null) { webAppPolicy.setParentList(dummySiteListState); documentList.add(webAppPolicy); } } catch (final Exception e) { LOGGER.log(Level.WARNING, "Problem while getting web app policy. ", e); } } if ((dummySiteListState.isExisting() || webState.isWebApplicationPolicyChange()) && null != document) { // Mark dummy list state to true in order to differentiate this list state // with // other lists in web state. //adding list page document. documentList.add(document); dummySiteListState.setSiteDefaultPage(true); webState.AddOrUpdateListStateInWebState(dummySiteListState, currentDummySiteDataList.getLastMod()); dummySiteListState.setCrawlQueue(documentList); // Resetting web application policy change flag. This will ensure // same webstate will not be processed again // for web application policy change. webState.setWebApplicationPolicyChange(false); } nDocuments += documentList.size(); } }