#!/usr/bin/env python3
# Author: Dr. Robert Heckendorn, Computer Science Department, University of Idaho
# Version 2b: updated Apr 9, 2019
#
# KDTree simple nearest neighbor algorithm expanded to help people understand how it works
# Version 2:
# a) some names changed to better explain their purpose
# b) a formal calling prototype more like my C++ code to clarify that best and bestex
#    behave more like globals and get better throughout the execution as explained in
#    class.
# c) very simple tree data so you can see what is done in python
#
from math import *
from test1 import *
# from kdData import *    # for 2nd test

# print(FeatureList)      # a list of features
# print(Data)             # a list of dictionaries.  One dictionary for each data sample.

# GLOBALS
# SortingFeature - what feature to sort on

Infinity = 1E+100    # bigger than maximum distance


# sort key function: pull out the value of the feature named SortingFeature
def compare(sample) :
    global SortingFeature
    return sample[SortingFeature]


# DISTANCE MEASURES
#
# These all assume that dist(a, b) >= abs(a[f]-b[f]) for all features f,
# which is what makes the border shortcut in nearestAux safe.
# The last feature in FeatureList is the answer feature and is never compared.

# L-infinity (Chebyshev) distance: the largest single-feature difference
def distMax(a, b) :
    sum = 0
    for featureNum in range(0, len(FeatureList)-1) :
        sum = max(sum, abs(a[FeatureList[featureNum]]-b[FeatureList[featureNum]]))
    return sum

# L3 distance
def dist3(a, b) :
    sum = 0
    for featureNum in range(0, len(FeatureList)-1) :
        sum += abs(a[FeatureList[featureNum]]-b[FeatureList[featureNum]])**3
    return sum**(1/3)

# L2 (Euclidean) distance
def dist2(a, b) :
    sum = 0
    for featureNum in range(0, len(FeatureList)-1) :
        sum += (a[FeatureList[featureNum]]-b[FeatureList[featureNum]])**2
    return sqrt(sum)

# L1 (Manhattan) distance
def dist(a, b) :
    sum = 0
    for featureNum in range(0, len(FeatureList)-1) :
        sum += abs(a[FeatureList[featureNum]]-b[FeatureList[featureNum]])
    return sum


# BUILD KDTREE
#
# featureNum is the number of the feature in the variable FeatureList
# feature is the feature *name* itself like R, G, or B
#
# PARENT node is: [feature, value, [child1, child2]]
#             or: [feature, value, [child1]]
# LEAF node is:   [feature, value]
#
def kdBuild(featureNum, data) :
    global SortingFeature

    # sort on feature number featureNum
    SortingFeature = FeatureList[featureNum]   # set up the feature to sort on (see compare function)
    data.sort(key=compare)

    # what is the next featureNum including wrap around
    feature = FeatureList[featureNum]
    nextFeatureNum = (featureNum + 1) % (len(FeatureList) - 1)   # don't include Ans feature

    # check the amount of data in this part of the tree
    length = len(data)

    # if only 1 then make it a leaf
    if length==1 :
        return [feature, data[0]]

    # if only 2 then make it a parent and a leaf
    # (the leaf stores the feature *name*, like every other leaf)
    elif length==2 :   # len(childlist) == 1  i.e. single child
        return [feature, data[1], [[FeatureList[nextFeatureNum], data[0]]]]

    # split at *median* of the data in this part of tree and build tree recursively
    else :             # len(childlist) == 2  i.e. double child
        return [feature, data[length//2],
                [kdBuild(nextFeatureNum, data[:length//2]),      # everything left of the median
                 kdBuild(nextFeatureNum, data[length//2+1:])]]   # everything right of the median
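
# For intuition, a small worked example of the tree layout (hypothetical data,
# not the course data from test1): with FeatureList = ["X", "Y", "Ans"] and
#    a = {"X": 1, "Y": 5, "Ans": 0}
#    b = {"X": 2, "Y": 3, "Ans": 1}
#    c = {"X": 3, "Y": 4, "Ans": 0}
# kdBuild(0, [a, b, c]) sorts on "X", takes b as the median, and returns
#    ["X", b, [["Y", a], ["Y", c]]]
# that is, a parent split on X whose two children are leaves keyed on Y.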

# using the given kdtree find the item that is closest to item
def nearest(kdtree, ourDist, item) :
    (best, bestex) = nearestAux(kdtree, ourDist, item, Infinity, None)
    return bestex


# best and bestex are passed in and returned as values of the function.
# In a C++ version kdtree would be the kdtree Matrix, and best and bestex
# could be passed as references as I did in my C++ code, making them more like
# globals.
def nearestAux(kdtree, ourDist, item, best, bestex) :
    # if this is a leaf
    if len(kdtree)==2 :   # this is a single node of feature + example
        (feature, splitPt) = kdtree
        if ourDist(item, splitPt) < best :   # closer than anything seen so far?
            best = ourDist(item, splitPt)
            bestex = splitPt
        return (best, bestex)

    # if this is a parent with a single child
    elif len(kdtree[2])==1 :
        (feature, splitPt, children) = kdtree

        # search the only child first
        (best, bestex) = nearestAux(children[0], ourDist, item, best, bestex)

        if abs(item[feature]-splitPt[feature]) > best :   # border too far away?
                                                          # (assume Euclidean dist)
            return (best, bestex)                         # SHORT CUT

        # try parent
        if ourDist(item, splitPt) < best :
            best = ourDist(item, splitPt)
            bestex = splitPt
        return (best, bestex)

    # if this is a parent with two children
    else :
        (feature, splitPt, children) = kdtree

        # search the child on the same side of the border as item first
        if item[feature] < splitPt[feature] :
            (near, far) = (children[0], children[1])
        else :
            (near, far) = (children[1], children[0])
        (best, bestex) = nearestAux(near, ourDist, item, best, bestex)

        if abs(item[feature]-splitPt[feature]) > best :   # border too far away?
                                                          # (assume Euclidean dist)
            return (best, bestex)                         # SHORT CUT

        # try parent
        if ourDist(item, splitPt) < best :
            best = ourDist(item, splitPt)
            bestex = splitPt

        # try the far child: the border is close enough that it might hide something closer
        return nearestAux(far, ourDist, item, best, bestex)
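

# A minimal usage sketch.  The data below is a hypothetical stand-in, not the
# course data imported from test1; it rebinds FeatureList and Data just for
# this demo.  Build a tree on two features plus an "Ans" class feature, then
# ask for the Euclidean nearest neighbor of a query point.
if __name__ == "__main__" :
    FeatureList = ["X", "Y", "Ans"]        # last feature is the answer, never compared
    Data = [{"X": 1, "Y": 5, "Ans": 0},
            {"X": 2, "Y": 3, "Ans": 1},
            {"X": 3, "Y": 4, "Ans": 0},
            {"X": 6, "Y": 1, "Ans": 1},
            {"X": 7, "Y": 7, "Ans": 0}]
    tree = kdBuild(0, Data)                # start splitting on feature number 0 ("X")
    query = {"X": 5, "Y": 2}               # Ans not needed: the distance measures skip it
    print(nearest(tree, dist2, query))     # -> {'X': 6, 'Y': 1, 'Ans': 1}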