A-Priori and PCY algorithms implementation using java – Mining Frequent Itemsets
The main objective of this project is to find frequent itemsets by implementing two efficient algorithms: A-Priori and PCY. The goal is to find frequent pairs of elements. You do not need to find triples and larger itemsets.
Dataset
The retail dataset contains anonymized retail market basket data (88K baskets) from an anonymous retail store. The preprocessing step to map text labels into integers has already been done. Use Sublime Text, TextPad or Notepad++ or other software to open the file. Do not use Notepad.
Experiments
Perform the scalability study for finding frequent pairs of elements by dividing the dataset into different chunks and measure the time performance. Provide the line chart. Provide results for the following support thresholds: 1%, 5%, 10%. For example, if your chuck is 10% of the dataset, you have around 8,800 baskets. Therefore, if your support threshold is 5%, you should count the pairs that appear in at least 440 baskets. See three samples below for three different support thresholds. Note, the sample charts contain hypothetical numbers!
Answer:
Tester Class
import java.io.*; public class Tester { public static void main(String[] args) throws IOException { long first,last,time_Diff; int[] percent = {1, 5, 10}; for (int per : percent) { for (int i = 1; i <= 12; i++) { String retailInput = "retail" + i + ".txt"; int size = buckcount(retailInput); System.out.println("\nInput file "+i +": "+retailInput); System.out.println("Bucket " + size + " read in at " + per + "% support"); int threshold = calc(size, per); first = System.currentTimeMillis(); APriori.setThreshold(threshold); APriori.toFindPair(retailInput); last = System.currentTimeMillis(); time_Diff = last - first; System.out.println("APriori: " + time_Diff + "ms."); first = System.currentTimeMillis(); PCY.setLimit(threshold); PCY.toFindPair(retailInput); last = System.currentTimeMillis(); time_Diff = last - first; System.out.println("PCY: " + time_Diff + "ms."); System.out.println("Results: " ); System.out.println(APriori.toFindPair(retailInput)); } } } /*method to count the buckets */ public static int buckcount(String retail) throws IOException { InputStream input = new BufferedInputStream(new FileInputStream(retail)); byte[] i = new byte[1024]; int readChars,n = 0; boolean empty = true; while ((readChars = input.read(i)) != -1) { empty = false; for (int k = 0; k < readChars; ++k) { //new line if (i[k] == '\n') { ++n; } } } //Check if the line is not empty return 1 otherwise return n return (n == 0 && !empty) ? 1 : n; } /*calculate the amount of support for the required percentage*/ public static int calc(int sum, double per) { if (per < 0) { return 0; } else if (per > 100) { return sum; } else { //cast to integer return (int) Math.round((per / 100) * sum); } } }
Start Class
public class Start { final Set1 key; Integer value; public Start(Set1 key, Integer value) { this.key = key; this.value = value; } public Integer getValue() { return value; } public Set1 getKey() { return key; } public void setValue(Integer value) { this.value = value; } public String toString() { return "[" + key + "=>" + value + "]"; } }
Set1 Class
import java.util.*; public class Set1<T1, T2> { //Variables public T1 o1,o2; //Constructor public Set1(T1 o1, T1 o2) { this.o1 = o1; this.o2 = o2; } //Setter public void setPair(T1 o1, T1 o2) { this.o1 = o1; this.o2 = o2; } public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof Set1)) { return false; } return (o1 == ((Set1) o).o1) && (o2 == ((Set1) o).o2); } /*This hash function which is used for 2 numbers of type integer*/ public int hashCode() { int hashVar = 321; hashVar = 20 * hashVar + Objects.hashCode(this.o1); hashVar = 50 * hashVar + Objects.hashCode(this.o2); return hashVar; } public String toString() { return "(" + o1 + ", " + o2 + ")"; } }
Set2 Class
import java.util.*; /*This class is used to override the hash code used by java*/ class Set2 <T1, T2> extends Set1 <T1, T2> { public Set2(T1 o1, T1 o2) { super(o1, o2); } @Override public int hashCode() { int hashVar = 20; hashVar = 150 * hashVar + Objects.hashCode(this.o1); hashVar = 50 * hashVar + Objects.hashCode(this.o2); return hashVar; } @Override public boolean equals(Object o) { if (!(o instanceof Set1)) { return false; } if (this == o) { return true; } return (o1 == ((Set1) o).o1) && (o2 == ((Set1) o).o2); } }
Bucket Class
import java.util.*; /*This class is used to hold entries*/ public class Bucket { //Variables private Start list[]; private int maxSize = 131072; //Constructor public Bucket(int num) { maxSize = num; list = new Start[maxSize]; } public Bucket() { list = new Start[maxSize]; } //Getters public int getmaxSize() { return maxSize; } public Integer get(Set1 key) { int hashVar = Math.abs(key.hashCode() % maxSize); Start start = list[hashVar]; return start.getValue(); } /*This is used to convert To BitSet*/ public BitSet toBit(Integer thresh) { BitSet vector = new BitSet(maxSize); for (int i = 0; i < maxSize; i++) { if (list[i] != null && list[i].getValue() >= thresh) { vector.set(i); } } return vector; } /*To convert to a bitvector */ public void input(Set1 key, Integer value) { int hashVar = Math.abs(key.hashCode() % maxSize); Start start = list[hashVar]; if (start == null) { Start newStart = new Start(key, value); list[hashVar] = newStart; } else { start.value = value; } } /*to checks if hash location has a key entry*/ public boolean haveKey(Set1 key) { int hash = Math.abs(key.hashCode() % maxSize); return (list[hash] != null); } //to string method to display output on the console public String toString() { String s; s = "{"; for (Start array : list) { if (array != null) { s += array; } } s = s+"}"; return s; } }
APriori Class
import java.io.*; import java.util.*; public final class APriori { private static int limit; //constructor private APriori() { limit = 0; } /*sets the threshold*/ public static void setThreshold(int max) { limit = max; } /*finds the frequent pair*/ public static LinkedHashMap<Set1, Integer> toFindPair(String retail) throws NumberFormatException, IOException { HashMap<Integer, Integer> ItemFrequent = AprioriPassOne(retail); LinkedHashMap<Set1,Integer> canBeSet = new LinkedHashMap<>(); String numsLine[],_line = ""; Integer value; ArrayList<Integer> myList = new ArrayList<>(); FileReader fileReader = new FileReader(retail); BufferedReader reader = new BufferedReader(fileReader); while ((_line = reader.readLine()) != null) { numsLine = _line.split("\\s+"); for (String num : numsLine) { value = Integer.parseInt(num); if (ItemFrequent.containsKey(value) && !myList.contains(value)) { myList.add(value); } } //use collection class through "java.util.Collections" for sorting Collections.sort(myList); for (int n = 0; n < myList.size(); n++) { for (int k = n + 1; k< myList.size(); k++) { Set1<Integer, Integer> val = new Set1<>(myList.get(n), myList.get(k)); if (canBeSet.containsKey(val)) { canBeSet.put(val, canBeSet.get(val) + 1); } else { canBeSet.put(val, 1); } } } myList.clear(); } reader.close(); removeNotFrequent(canBeSet); return canBeSet; } /* to find freq Item Counts*/ private static HashMap<Integer, Integer> AprioriPassOne(String fileName) throws NumberFormatException, IOException { HashMap<Integer, Integer> pass1 = new HashMap<>(); String numsLine[],_line = ""; Integer key; FileReader readFile = new FileReader(fileName); BufferedReader toRead = new BufferedReader(readFile); while ((_line = toRead.readLine()) != null) { numsLine = _line.split("\\s+"); for (String num : numsLine) { key = Integer.parseInt(num); if (pass1.containsKey(key)) { pass1.put(key, pass1.get(key) + 1); } else { pass1.put(key, 1); } } } toRead.close(); removeNotFrequent(pass1); return pass1; } /*method to check if item is frequent*/ private static boolean checkFrequancy(int item) { return item >= limit; } /*remove none freq item*/ private static void removeNotFrequent(HashMap<Integer, Integer> map) { Iterator<Map.Entry<Integer, Integer>> check = map.entrySet().iterator(); while (check.hasNext()) { Map.Entry<Integer, Integer> entry = check.next(); if (!(checkFrequancy(entry.getValue()))) { check.remove(); } } } /*remove none freq pair*/ private static void removeNotFrequent(LinkedHashMap<Set1, Integer> map) { Iterator<Map.Entry<Set1, Integer>> check = map.entrySet().iterator(); while (check.hasNext()) { Map.Entry<Set1, Integer> entry = check.next(); if (!checkFrequancy(entry.getValue())) { check.remove(); } } } }
PCY Class
import java.io.*; import java.util.*; public final class PCY { //Variables private static int limit,maxSize = 131072;// set the size to 2^17 private static BitSet bit; //Setters public static void setBucketSize(int bSize) { maxSize = bSize; } public static void setLimit(int p ) { limit = p ; } //Link the hash maps public static LinkedHashMap<Set1, Integer> toFindPair(String retail) throws NumberFormatException, IOException { HashMap<Integer, Integer> ItemFrequent = PCYPassOne(retail); LinkedHashMap<Set1, Integer> canBeSet = new LinkedHashMap<>(); String _line,numberOfLines[]; Integer key; ArrayList<Integer> myList = new ArrayList<>(); FileReader toReadFile = new FileReader(retail); BufferedReader reader = new BufferedReader(toReadFile); while ((_line = reader.readLine()) != null) { numberOfLines = _line.split("\\s+"); for (String toNum : numberOfLines) { /*convert to integer*/ key = Integer.parseInt(toNum); if (ItemFrequent.containsKey(key) && !myList.contains(key)) { myList.add(key); } } /*sort myList*/ Collections.sort(myList); for (int n = 0; n < myList.size(); n++) { /*create pairs*/ for (int k = n + 1; k < myList.size(); k++) { Set1<Integer, Integer> val = new Set1<>(myList.get(n), myList.get(k)); if (!bit.get(Math.abs(val.hashCode()) % maxSize)) { continue; } if (canBeSet.containsKey(val)) { canBeSet.put(val, canBeSet.get(val) + 1); } else { canBeSet.put(val, 1); } } } myList.clear(); } reader.close(); removeNotFrequent(canBeSet); /*return frequent pairs and number of support*/ return canBeSet; } /* first pass -- to find freq Item Counts */ private static HashMap<Integer,Integer> PCYPassOne(String fileName) throws NumberFormatException, IOException { //Decelerations HashMap<Integer, Integer> pass1 = new HashMap<>(); Bucket firstBucket = new Bucket(maxSize); String totalLine[], _lin = ""; Integer value; ArrayList<Integer> myList = new ArrayList<>(); FileReader readFile = new FileReader(fileName); BufferedReader reader = new BufferedReader(readFile); while ((_lin = reader.readLine()) != null) { totalLine = _lin.split("\\s+"); for (String num : totalLine) { value = Integer.parseInt(num); if (pass1.containsKey(value)) { pass1.put(value, pass1.get(value) + 1); } else { pass1.put(value, 1); } if (!myList.contains(value)) { myList.add(value); } } Collections.sort(myList); for (int n = 0; n < myList.size(); n++) { for (int k = n + 1; k < myList.size(); k++) { Set1 <Integer, Integer> val = new Set1 < >(myList.get(n), myList.get(k)); if (firstBucket.haveKey(val)) { firstBucket.input(val, firstBucket.get(val) + 1); } else { firstBucket.input(val, 1); } } } myList.clear(); } reader.close(); removeNotFrequent(pass1); bit = firstBucket.toBit(limit); firstBucket = null; return pass1; } private static boolean checkFrequancy(int num) { return num >= limit; } /*To remove none freq. Pair*/ private static void removeNotFrequent(HashMap<Integer, Integer> map) { Iterator<Map.Entry<Integer, Integer>> check = map.entrySet().iterator(); while (check.hasNext()) { Map.Entry<Integer, Integer> entry = check.next(); if (!checkFrequancy(entry.getValue())) { check.remove(); } } } /*To remove none freq. Pair*/ private static void removeNotFrequent(LinkedHashMap<Set1, Integer> map) { Iterator<Map.Entry<Set1, Integer>> check = map.entrySet().iterator(); while (check.hasNext()) { Map.Entry<Set1,Integer> entry = check.next(); if (!checkFrequancy(entry.getValue())) { check.remove(); } } } }
Leave a reply