Java tutorial
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    Copyright (C) 2015 George Antony Papadakis (gpapadis@yahoo.gr)
 */
package BlockBuilding;

import DataStructures.BilateralBlock;
import DataStructures.EntityProfile;
import DataStructures.UnilateralBlock;
import Utilities.Converter;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

/**
 * Extended Sorted Neighborhood blocking: the distinct blocking keys of the
 * index(es) are sorted alphabetically and a window of {@code windowSize}
 * consecutive <em>keys</em> is slid over that sorted list. Every window
 * position yields at most one block, containing the union of the entities
 * associated with the keys inside the window.
 *
 * @author gap2
 */
public abstract class AbstractExtendedSortedNeighborhoodBlocking extends AbstractSortedNeighborhoodBlocking {

    /**
     * Creates the variant labelled "Memory-based".
     *
     * @param w        window size, forwarded to the superclass constructor
     * @param profiles entity profile lists, forwarded to the superclass
     *                 constructor
     */
    public AbstractExtendedSortedNeighborhoodBlocking(int w, List<EntityProfile>[] profiles) {
        super(w, "Memory-based Extended Sorted Neighborhood Blocking", profiles);
    }

    /**
     * Creates the variant labelled "Disk-based".
     *
     * @param w        window size, forwarded to the superclass constructor
     * @param entities forwarded to the superclass constructor (presumably the
     *                 entity collection paths -- confirm against the superclass)
     * @param index    forwarded to the superclass constructor (presumably the
     *                 index directory paths -- confirm against the superclass)
     */
    public AbstractExtendedSortedNeighborhoodBlocking(int w, String[] entities, String[] index) {
        super(w, "Disk-based Extended Sorted Neighborhood Blocking", entities, index);
    }

    /**
     * Builds unilateral blocks from the single index in
     * {@code indexDirectory[0]}.
     * <p>
     * A block is added for a window position only if the keys in that window
     * cover more than one distinct entity. When the index holds fewer distinct
     * keys than {@code windowSize}, no window fits and no block is produced.
     * Afterwards {@code noOfEntities[0]} holds the number of documents in the
     * index. The reader is closed even if processing fails part-way.
     */
    @Override
    protected void parseIndex() {
        final IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
        try {
            final Set<String> blockingKeysSet = getTerms(d1Reader);
            final String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
            Arrays.sort(sortedTerms);

            //slide window over the sorted list of blocking keys
            final int upperLimit = sortedTerms.length - windowSize;
            final int[] documentIds = Utilities.getDocumentIds(d1Reader);
            for (int i = 0; i <= upperLimit; i++) {
                // a set, so entities shared by several keys of the window count once
                final Set<Integer> entityIds = new HashSet<>();
                for (int j = 0; j < windowSize; j++) {
                    entityIds.addAll(getTermEntities(documentIds, d1Reader, sortedTerms[i + j]));
                }

                // a block with a single entity yields no comparison
                if (1 < entityIds.size()) {
                    int[] idsArray = Converter.convertCollectionToArray(entityIds);
                    UnilateralBlock uBlock = new UnilateralBlock(idsArray);
                    blocks.add(uBlock);
                }
            }

            noOfEntities = new double[1];
            noOfEntities[0] = d1Reader.numDocs();
        } finally {
            // release the reader on every path, not only on success
            Utilities.closeReader(d1Reader);
        }
    }

    /**
     * Builds bilateral blocks from the two indices in
     * {@code indexDirectory[0]} and {@code indexDirectory[1]}.
     * <p>
     * The window slides over the sorted union of the blocking keys of both
     * indices. A block is added for a window position only if the keys in that
     * window cover at least one entity from each index. When there are fewer
     * distinct keys than {@code windowSize}, no block is produced. Afterwards
     * {@code noOfEntities[0]} and {@code noOfEntities[1]} hold the number of
     * documents in the first and second index. Both readers are closed even
     * if processing fails part-way.
     */
    @Override
    protected void parseIndices() {
        final IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
        try {
            final IndexReader d2Reader = Utilities.openReader(indexDirectory[1]);
            try {
                final Set<String> blockingKeysSet = getTerms(d1Reader);
                blockingKeysSet.addAll(getTerms(d2Reader));
                final String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
                Arrays.sort(sortedTerms);

                //slide window over the sorted list of blocking keys
                final int upperLimit = sortedTerms.length - windowSize;
                final int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader);
                final int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader);
                for (int i = 0; i <= upperLimit; i++) {
                    final Set<Integer> entityIds1 = new HashSet<>();
                    final Set<Integer> entityIds2 = new HashSet<>();
                    for (int j = 0; j < windowSize; j++) {
                        final String key = sortedTerms[i + j];
                        try {
                            final Term term = new Term(VALUE_LABEL, key);

                            // a key of the merged list may exist in only one of the
                            // indices, so check its frequency before fetching entities
                            if (0 < d1Reader.docFreq(term)) {
                                entityIds1.addAll(getTermEntities(documentIdsD1, d1Reader, key));
                            }
                            if (0 < d2Reader.docFreq(term)) {
                                entityIds2.addAll(getTermEntities(documentIdsD2, d2Reader, key));
                            }
                        } catch (IOException ex) {
                            // best effort: report the failure, skip the remaining
                            // lookups for this key and carry on with the window
                            ex.printStackTrace();
                        }
                    }

                    // a bilateral block needs entities from both sides
                    if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) {
                        int[] idsArray1 = Converter.convertCollectionToArray(entityIds1);
                        int[] idsArray2 = Converter.convertCollectionToArray(entityIds2);
                        BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2);
                        blocks.add(bBlock);
                    }
                }

                noOfEntities = new double[2];
                noOfEntities[0] = d1Reader.numDocs();
                noOfEntities[1] = d2Reader.numDocs();
            } finally {
                // release the second reader on every path
                Utilities.closeReader(d2Reader);
            }
        } finally {
            // release the first reader even if opening/processing the second failed
            Utilities.closeReader(d1Reader);
        }
    }
}