1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 __all__ = ['UAC','thresh_1d','thresh_normal_1d','sort_1d','histogram_1d']
26
27 from math import ceil
28 from numpy import zeros, arange
29
30 from pycv.cs.ml import Predictor, Dataset
31 from pycv.cs.dm import argsort, argsort_and_cut
32
33 from pycv.ext import thresh1d_sdSolveUAC, thresh1d_histogram_1d, \
34 thresh1d_histogram_1d_w, thresh1d_sdTSolve, thresh1d_sdTSolve_w, \
35 thresh1d_sdGTSolve
36
37
38
40 bparam = zeros(2)
41 bsol = zeros(2,'int')
42
43 thresh1d_sdSolveUAC(s, sid, y, nspc, x, bid, B, minF0, bnd2, typ, bparam, bsol)
44
45 return bparam, bsol
46
47
49 """Univariate Additive Classifier
50
51 A classifier of the form:
52 Classify x as class y = sgn( s(x) + c sgn(x-b) )
53 where s(x) is a score function, (c,b) are unknown parameters.
54 Suppose there are N training samples (x_n, y_n, s(x_n)).
55 Let:
56 + B(b) = (sgn(x_1-b), sgn(x_2-b), ..., sgn(x_N-b))
57 + F(c,b) = (sgn(s(x_1) + c sgn(x_1-b)), ..., sgn(s(x_N) + c sgn(x_N-b)))
58 When training with N samples (x_n, y_n, F(x_n)), it is provable that
59 + R can be partitioned into at most N+1 intervals such that in any
60 interval, B(b) is fixed for different values of b
61 + Given b is known, R can be partitioned into at most N+1 intervals such
62 that in any interval, F(c,b) is fixed for different values of c
63 + Given b is fixed but unknown, R can be partitioned into at most 2N+1
64 intervals such that in any interval, F(c,b) is fixed for different
65 values of c
66 s(x) must not be 0 for all x_n
67 """
69 """Initialize the class.
70
71 Input:
72 s, y: see the class' docstring.
73 """
74 if len(s) != len(y):
75 raise ValueError("Size of s is not equal size of y")
76 self.N = len(s)
77
78 self.s = s
79 self.y = y
80
81
82 self.sid = argsort(abs(s))
83
84
85 self.nspc = zeros(2,'int')
86 self.nspc[1] = (y != 0).sum()
87 self.nspc[0] = self.N - self.nspc[1]
88
89
90 self.minF0 = zeros(2,'int')
91 self.minF0[0] = ((y != 0)&(s < 0)).sum()
92 self.minF0[1] = ((y == 0)&(s > 0)).sum()
93
99
100
102 """Get the current best solution.
103
104 Input:
105 typ: = 0 if to constrain FRR <= maxFRR and minimize FAR < maxFAR
106 = 1 if to constrain FAR <= maxFAR and minimize FRR < maxFRR
107 bnd: a numpy.array of 2 doubles: maxFRR and maxFAR
108 Output:
109 bparam: best (b,c)
110 bsol: best (FRR,FAR)
111 """
112 typ = int(typ)
113 bnd2 = zeros(2,'int')
114 bnd2[typ] = int(bnd[typ]*self.nspc[1-typ])
115 bnd2[1-typ] = int(ceil(bnd[1-typ]*self.nspc[typ]))
116
117 bparam, bsol = _sdSolveUAC(self.s, self.sid, self.y, self.nspc, self.x, \
118 self.bid, self.B, self.minF0,bnd2,typ)
119
120 bsol2 = zeros(2)
121 bsol2[0] = float(bsol[0])/self.nspc[1]
122 bsol2[1] = float(bsol[1])/self.nspc[0]
123
124 return bparam, bsol2
125
126 - def solve(self,typ,bnd):
127 """If typ == 0, constrain FRR <= bnd and minimize FAR. Otherwise constrain FAR <= bnd and minimize FRR.
128
129 Input:
130 typ: type of optimization, minimize FAR if typ == 0. Otherwise minimize FRR.
131 bnd: upper bound to the other, in [0,1].
132 """
133 bnd2 = zeros(2,'int')
134 if typ == 0:
135 bnd2[0] = int(round(bnd*self.nspc[1]))
136 bnd2[1] = self.N+1
137 else:
138 bnd2[0] = self.N+1
139 bnd2[1] = int(round(bnd*self.nspc[0]))
140
141 bparam, bsol = _sdSolveUAC(self.s, self.sid, self.y, self.nspc, self.x, \
142 self.bid, self.B, self.minF0,bnd2,typ)
143
144 bsol2 = zeros(2)
145 bsol2[0] = float(bsol[0])/self.nspc[1]
146 bsol2[1] = float(bsol[1])/self.nspc[0]
147
148 return bparam, bsol2
149
150 - def get(self,crit,param1):
151 """Find b, c such that:
152 If crit = 0: to minimize thelambda*FRR(b,c)+FAR(b,c),
153 where thelambda = param1
154 If crit = 1: to minimize FAR(b,c) subject to FRR(b,c) <= 1-minDR,
155 where minDR = param1
156 If crit = 2: to minimize FRR(b,c) subject to FAR(b,c) <= maxFAR,
157 where maxFAR = param1
158 Return:
159 bparam: (b,c)
160 bsol: (FRR,FAR)
161 """
162 if crit == 0:
163
164 bparam, bsol = self.solve(0,0)
165 maxFAR = bsol[1]
166
167
168 bparam, bsol = self.solve(1,0)
169 maxFRR = bsol[0]
170
171 if(maxFAR < maxFRR):
172 def func(x):
173 bparam, bsol = self.solve(0,x)
174
175 return param1*x + bsol[1]
176 xmin = fminbound(func,0,maxFRR,xtol=1.0/self.N)
177 return self.solve(0,xmin)
178
179 else:
180 def func(x):
181 bparam, bsol = self.solve(1,x)
182
183 return param1*bsol[0] + x
184 xmin = fminbound(func,0,maxFAR,xtol=1.0/self.N)
185 return self.solve(1,xmin)
186
187 return bparam, bsol
188
189 if crit == 1:
190 return self.solve(0,1-param1)
191
192 if crit == 2:
193 return self.solve(1,param1)
194
195 raise ValueError("crit is not 0, 1, nor 2")
196
197
198
199
200 -def histogram_1d(wcd, nbins, minValue = None, maxValue = None):
201 """Compute a histogram from a (weighted) classification dataset with ishape=()
202
203 :Parameters:
204 wcd : WeightedCDataset
205 a (weighted) dataset of J classes and ishape ()
206
207 nbins : integer
208 the number of bins
209
210 minValue : double
211 minimum value of the view, default is the smallest value in the dataset
212
213 maxValue : double
214 maximum value of the view, default is the largest value in the dataset
215
216 :Returns:
217 hist : array(shape=(J,nbins), dtype='double')
218 J histograms of J classes
219
220 bin_interval : array(shape=(nbins,2),dtype='double')
221 array of nbins bin intervals
222
223 """
224
225
226 nbins = int(nbins)
227 bin_interval = zeros((nbins,2))
228
229 if minValue is None:
230 minValue = min([x.min() for x in wcd.input_data]) - 1E-10
231 minValue = float(minValue)
232
233 if maxValue is None:
234 maxValue = max([x.max() for x in wcd.input_data]) + 1E-10
235 maxValue = float(maxValue)
236
237 resolution = (maxValue - minValue) / nbins
238 bin_interval[:,0] = arange(nbins) * resolution + minValue
239 bin_interval[:,1] = bin_interval[:,0] + resolution
240
241
242 hist = zeros((wcd.J,nbins))
243 if wcd.weights is None:
244 for j in xrange(wcd.J):
245 input_array = wcd.input_data[j]
246 hist_array = hist[j]
247
248 thresh1d_histogram_1d(input_array, minValue, maxValue, nbins,
249 hist_array)
250 else:
251 for j in xrange(wcd.J):
252 input_array = wcd.input_data[j]
253 hist_array = hist[j]
254 weights = wcd.weights[j]
255
256 thresh1d_histogram_1d_w(input_array, minValue, maxValue, nbins,
257 hist_array, weights)
258
259 return hist, bin_interval
260
261
263 """Take a classification dataset with ishape=(), sort them in ascending order.
264
265 :Parameters:
266 cd : CDataset
267 a dataset of J classes and ishape ()
268
269 :Returns:
270 sorted_id : array(shape=(cd.N,2),'int')
271 each tuple (j,index) represents an input value, which is identified
272 by its class 'j' and its index 'index' in the class
273 """
274 val = zeros(cd.N)
275 vj = zeros((cd.N,2),'int')
276 i2 = 0
277 for j in xrange(cd.J):
278 i1 = i2
279 i2 += cd.nspc[j]
280 val[i1:i2] = cd.input_data[j]
281 vj[i1:i2,0] = j
282 vj[i1:i2,1] = arange(cd.nspc[j])
283
284 return vj[argsort(val)]
285
286 -def thresh_1d(criterion,param1,wcd,sort_id = None):
287 """Solve threshold-based 1D binary classifier.
288
289 The function solves the following problem:
290 Given two sets of samples of two classes, a positive one and a negative one,
291 a threshold-based classifier classifies a value x into a positive or a negative
292 class: sign(x - \theta). The optimal \theta is chosen based on different criteria:
293 - Minimize the classification error: \lambda * p(pos)*FRR + p(neg)*FAR
294 - Minimize the error without prior: \lambda * FRR + FAR
295 - Minimize FAR with constraint FRR <= maxFRR
296 - Minimize FRR with constraint FAR <= maxFAR
297
298 :Parameters:
299 criterion : integer from 0 to 3
300 0: minimize classification error with prior probabilities
301 1: minimize classification error without prior probabilities
302 2: minimize FAR while constraining FRR
303 3: minimize FRR while constraining FAR
304 param1 : double
305 a parameter representing
306 \lambda if criterion < 2
307 maxFRR if criterion == 2
308 maxFAR if criterion == 3
309 wcd : WeightedCDataset(J=2,ishape=()) (in cs.ml.cla package)
310 a dataset of sample values of the two classes
311 sort_id : array
312 the result of calling sort_1d(wcd),
313 if sort_id is None, sort_1d(wcd) is called
314
315
316 :Returns:
317 result : array(shape=(2,),dtype='d')
318 an argout array representing
319 - result[0]: the threshold
320 - result[1]: the optimized function value at that threshold
321 """
322 criterion = int(criterion)
323 param1 = float(param1)
324 idata0 = wcd.input_data[0]
325 idata1 = wcd.input_data[1]
326 if sort_id is None:
327 sort_id = sort_1d(wcd)
328 result = zeros(2)
329
330 if wcd.weights is None:
331 thresh1d_sdTSolve(criterion, param1, idata0, idata1, sort_id, result)
332 else:
333 w0 = wcd.weights[0]
334 w1 = wcd.weights[1]
335 thresh1d_sdTSolve_w(criterion, param1, idata0, idata1, sort_id, w0, w1, result)
336
337 return result
338
339
341 """
342 Solve the class-conditional Gaussian-assumed classification with thresholding and goal.
343
344 This set of functions solve the following problem:
345 Given two classes normally distributed, a positive one and a negative one,
346 a threshold-based classifier classifies a value x into a positive or a negative
347 class: sign(x - \theta). The optimal \theta is chosen based on different criteria:
348 - Minimize the classification error: \lambda * p(pos)*FRR + p(neg)*FAR
349 - Minimize the error without prior: \lambda * FRR + FAR
350 - Minimize FAR with constraint FRR <= maxFRR
351 - Minimize FRR with constraint FAR <= maxFAR
352
353 :Parameters:
354 criterion : integer from 0 to 3
355 0: minimize classification error with prior probabilities
356 1: minimize classification error without prior probabilities
357 2: minimize FAR while constraining FRR
358 3: minimize FRR while constraining FAR
359 param1 : double
360 a parameter representing
361 \lambda if criterion < 2
362 maxFRR if criterion == 2
363 maxFAR if criterion == 3
364 stats : Stats2e(J=2,d=1)
365 REQUIREMENT: mean of class 0 <= mean of class 1
366
367 :Returns:
368 result : array(shape=(2,),dtype='d')
369 an argout array representing
370 - result[0]: the threshold
371 - result[1]: the optimized function value at that threshold
372 """
373 criterion = int(criterion)
374 param1 = float(param1)
375 st = stats.A.ravel()
376 result = zeros(2)
377 thresh1d_sdGTSolve(criterion, param1, st, result)
378
379 return result
380