[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]
vigra/sampling.hxx | ![]() |
00001 /************************************************************************/ 00002 /* */ 00003 /* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */ 00004 /* */ 00005 /* This file is part of the VIGRA computer vision library. */ 00006 /* The VIGRA Website is */ 00007 /* http://hci.iwr.uni-heidelberg.de/vigra/ */ 00008 /* Please direct questions, bug reports, and contributions to */ 00009 /* ullrich.koethe@iwr.uni-heidelberg.de or */ 00010 /* vigra@informatik.uni-hamburg.de */ 00011 /* */ 00012 /* Permission is hereby granted, free of charge, to any person */ 00013 /* obtaining a copy of this software and associated documentation */ 00014 /* files (the "Software"), to deal in the Software without */ 00015 /* restriction, including without limitation the rights to use, */ 00016 /* copy, modify, merge, publish, distribute, sublicense, and/or */ 00017 /* sell copies of the Software, and to permit persons to whom the */ 00018 /* Software is furnished to do so, subject to the following */ 00019 /* conditions: */ 00020 /* */ 00021 /* The above copyright notice and this permission notice shall be */ 00022 /* included in all copies or substantial portions of the */ 00023 /* Software. */ 00024 /* */ 00025 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */ 00026 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */ 00027 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */ 00028 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */ 00029 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */ 00030 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */ 00031 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */ 00032 /* OTHER DEALINGS IN THE SOFTWARE. */ 00033 /* */ 00034 /************************************************************************/ 00035 00036 #ifndef VIGRA_SAMPLING_HXX 00037 #define VIGRA_SAMPLING_HXX 00038 00039 #include "array_vector.hxx" 00040 #include "random.hxx" 00041 #include <map> 00042 #include <memory> 00043 #include <cmath> 00044 00045 namespace vigra 00046 { 00047 00048 /** \addtogroup MachineLearning Machine Learning 00049 **/ 00050 //@{ 00051 00052 00053 /**\brief Options object for the Sampler class. 00054 00055 <b>usage:</b> 00056 00057 \code 00058 SamplerOptions opt = SamplerOptions() 00059 .withReplacement() 00060 .sampleProportion(0.5); 00061 \endcode 00062 00063 Note that the return value of all methods is <tt>*this</tt> which makes 00064 concatenating of options as above possible. 00065 */ 00066 class SamplerOptions 00067 { 00068 public: 00069 00070 double sample_proportion; 00071 unsigned int sample_size; 00072 bool sample_with_replacement; 00073 bool stratified_sampling; 00074 00075 SamplerOptions() 00076 : sample_proportion(1.0), 00077 sample_size(0), 00078 sample_with_replacement(true), 00079 stratified_sampling(false) 00080 {} 00081 00082 /**\brief Sample from training population with replacement. 00083 * 00084 * <br> Default: true 00085 */ 00086 SamplerOptions& withReplacement(bool in = true) 00087 { 00088 sample_with_replacement = in; 00089 return *this; 00090 } 00091 00092 /**\brief Sample from training population without replacement. 00093 * 00094 * <br> Default (if you don't call this function): false 00095 */ 00096 SamplerOptions& withoutReplacement(bool in = true) 00097 { 00098 sample_with_replacement = !in; 00099 return *this; 00100 } 00101 00102 /**\brief Draw the given number of samples. 00103 * If stratifiedSampling is true, the \a size is equally distributed 00104 * across all strata (e.g. <tt>size / strataCount</tt> samples are taken 00105 * from each stratum, subject to rounding). 00106 * 00107 * <br> Default: 0 (i.e. determine the count by means of sampleProportion()) 00108 */ 00109 SamplerOptions& sampleSize(unsigned int size) 00110 { 00111 sample_size = size; 00112 return *this; 00113 } 00114 00115 00116 /**\brief Determine the number of samples to draw as a proportion of the total 00117 * number. That is, we draw <tt>count = totalCount * proportion</tt> samples. 00118 * This option is overridden when an absolute count is specified by sampleSize(). 00119 * 00120 * If stratifiedSampling is true, the count is equally distributed 00121 * across all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken 00122 * from each stratum). 00123 * 00124 * <br> Default: 1.0 00125 */ 00126 SamplerOptions& sampleProportion(double proportion) 00127 { 00128 vigra_precondition(proportion >= 0.0, 00129 "SamplerOptions::sampleProportion(): argument must not be negative."); 00130 sample_proportion = proportion; 00131 return *this; 00132 } 00133 00134 /**\brief Draw equally many samples from each "stratum". 00135 * A stratum is a group of like entities, e.g. pixels belonging 00136 * to the same object class. This is useful to create balanced samples 00137 * when the class probabilities are very unbalanced (e.g. 00138 * when there are many background and few foreground pixels). 00139 * Stratified sampling thus avoids that a trained classifier is biased 00140 * towards the majority class. 00141 * 00142 * <br> Default (if you don't call this function): false 00143 */ 00144 SamplerOptions& stratified(bool in = true) 00145 { 00146 stratified_sampling = in; 00147 return *this; 00148 } 00149 }; 00150 00151 /************************************************************/ 00152 /* */ 00153 /* Sampler */ 00154 /* */ 00155 /************************************************************/ 00156 00157 /** \brief Create random samples from a sequence of indices. 00158 00159 Selecting data items at random is a basic task of machine learning, 00160 for example in boostrapping, RandomForest training, and cross validation. 00161 This class implements various ways to select random samples via their indices. 00162 Indices are assumed to be consecutive in 00163 the range <tt>0 <= index < total_sample_count</tt>. 00164 00165 The class always contains a current sample which can be accessed by 00166 the index operator or by the function sampledIndices(). The indices 00167 that are not in the current sample (out-of-bag indices) can be accessed 00168 via the function oobIndices(). 00169 00170 The sampling method (with/without replacement, stratified or not) and the 00171 number of samples to draw are determined by the option object 00172 SamplerOptions. 00173 00174 <b>Usage:</b> 00175 00176 <b>\#include</b> <vigra/sampling.hxx><br> 00177 Namespace: vigra 00178 00179 Create a Sampler with default options, i.e. sample as many indices as there 00180 are data elements, with replacement. On average, the sample will contain 00181 <tt>0.63*totalCount</tt> distinct indices. 00182 00183 \code 00184 int totalCount = 10000; // total number of data elements 00185 int numberOfSamples = 20; // repeat experiment 20 times 00186 Sampler<> sampler(totalCount); 00187 for(int k=0; k<numberOfSamples; ++k) 00188 { 00189 // process current sample 00190 for(int i=0; i<sampler.sampleSize(); ++i) 00191 { 00192 int currentIndex = sampler[i]; 00193 processData(data[currentIndex]); 00194 } 00195 // create next sample 00196 sampler.sample(); 00197 } 00198 \endcode 00199 00200 Create a Sampler for stratified sampling, without replacement. 00201 00202 \code 00203 // prepare the strata (i.e. specify which stratum each element belongs to) 00204 int stratumSize1 = 2000, stratumSize2 = 8000, 00205 totalCount = stratumSize1 + stratumSize2; 00206 ArrayVerctor<int> strata(totalCount); 00207 for(int i=0; i<stratumSize1; ++i) 00208 strata[i] = 1; 00209 for(int i=stratumSize1; i<stratumSize2; ++i) 00210 strata[i] = 2; 00211 00212 int sampleSize = 200; // i.e. sample 100 elements from each of the two strata 00213 int numberOfSamples = 20; // repeat experiment 20 times 00214 Sampler<> stratifiedSampler(strata.begin(), strata.end(), 00215 SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize)); 00216 00217 for(int k=0; k<numberOfSamples; ++k) 00218 { 00219 // process current sample 00220 for(int i=0; i<sampler.sampleSize(); ++i) 00221 { 00222 int currentIndex = sampler[i]; 00223 processData(data[currentIndex]); 00224 } 00225 // create next sample 00226 sampler.sample(); 00227 } 00228 \endcode 00229 */ 00230 template<class Random = MersenneTwister > 00231 class Sampler 00232 { 00233 public: 00234 /** Internal type of the indices. 00235 Currently, 64-bit indices are not supported because this 00236 requires extension of the random number generator classes. 00237 */ 00238 typedef Int32 IndexType; 00239 00240 typedef ArrayVector <IndexType> IndexArrayType; 00241 00242 /** Type of the array view object that is returned by 00243 sampledIndices() and oobIndices(). 00244 */ 00245 typedef ArrayVectorView <IndexType> IndexArrayViewType; 00246 00247 private: 00248 typedef std::map<IndexType, IndexArrayType> StrataIndicesType; 00249 typedef std::map<IndexType, int> StrataSizesType; 00250 typedef ArrayVector <bool> IsUsedArrayType; 00251 typedef ArrayVectorView <bool> IsUsedArrayViewType; 00252 00253 static const int oobInvalid = -1; 00254 00255 int total_count_, sample_size_; 00256 mutable int current_oob_count_; 00257 StrataIndicesType strata_indices_; 00258 StrataSizesType strata_sample_size_; 00259 IndexArrayType current_sample_; 00260 mutable IndexArrayType current_oob_sample_; 00261 IsUsedArrayType is_used_; 00262 Random const & random_; 00263 SamplerOptions options_; 00264 00265 void initStrataCount() 00266 { 00267 // compute how many samples to take from each stratum 00268 // (may be unequal if sample_size_ is not a multiple of strataCount()) 00269 int strata_sample_size = (int)std::ceil(double(sample_size_) / strataCount()); 00270 int strata_total_count = strata_sample_size * strataCount(); 00271 00272 for(StrataIndicesType::iterator i = strata_indices_.begin(); 00273 i != strata_indices_.end(); ++i) 00274 { 00275 if(strata_total_count > sample_size_) 00276 { 00277 strata_sample_size_[i->first] = strata_sample_size - 1; 00278 --strata_total_count; 00279 } 00280 else 00281 { 00282 strata_sample_size_[i->first] = strata_sample_size; 00283 } 00284 } 00285 } 00286 00287 public: 00288 00289 /** Create a sampler for \a totalCount data objects. 00290 00291 In each invocation of <tt>sample()</tt> below, it will sample 00292 indices according to the options passed. If no options are given, 00293 <tt>totalCount</tt> indices will be drawn with replacement. 00294 */ 00295 Sampler(UInt32 totalCount, SamplerOptions const & opt = SamplerOptions(), 00296 Random const & rnd = Random(RandomSeed)) 00297 : total_count_(totalCount), 00298 sample_size_(opt.sample_size == 0 00299 ? (int)(std::ceil(total_count_ * opt.sample_proportion)) 00300 : opt.sample_size), 00301 current_oob_count_(oobInvalid), 00302 current_sample_(sample_size_), 00303 current_oob_sample_(total_count_), 00304 is_used_(total_count_), 00305 random_(rnd), 00306 options_(opt) 00307 { 00308 vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_, 00309 "Sampler(): Cannot draw without replacement when data size is smaller than sample count."); 00310 00311 vigra_precondition(!opt.stratified_sampling, 00312 "Sampler(): Stratified sampling requested, but no strata given."); 00313 00314 // initialize a single stratum containing all data 00315 strata_indices_[0].resize(total_count_); 00316 for(int i=0; i<total_count_; ++i) 00317 strata_indices_[0][i] = i; 00318 00319 initStrataCount(); 00320 //this is screwing up the random forest tests. 00321 //sample(); 00322 } 00323 00324 /** Create a sampler for stratified sampling. 00325 00326 <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence 00327 which specifies for each sample the stratum it belongs to. The 00328 total number of data objects will be set to <tt>strataEnd - strataBegin</tt>. 00329 Equally many samples (subject to rounding) will be drawn from each stratum, 00330 unless the option object explicitly requests unstratified sampling, 00331 in which case the strata are ignored. 00332 */ 00333 template <class Iterator> 00334 Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(), 00335 Random const & rnd = Random(RandomSeed)) 00336 : total_count_(strataEnd - strataBegin), 00337 sample_size_(opt.sample_size == 0 00338 ? (int)(std::ceil(total_count_ * opt.sample_proportion)) 00339 : opt.sample_size), 00340 current_oob_count_(oobInvalid), 00341 current_sample_(sample_size_), 00342 current_oob_sample_(total_count_), 00343 is_used_(total_count_), 00344 random_(rnd), 00345 options_(opt) 00346 { 00347 vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_, 00348 "Sampler(): Cannot draw without replacement when data size is smaller than sample count."); 00349 00350 // copy the strata indices 00351 if(opt.stratified_sampling) 00352 { 00353 for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin) 00354 { 00355 strata_indices_[*strataBegin].push_back(i); 00356 } 00357 } 00358 else 00359 { 00360 strata_indices_[0].resize(total_count_); 00361 for(int i=0; i<total_count_; ++i) 00362 strata_indices_[0][i] = i; 00363 } 00364 00365 vigra_precondition(sample_size_ >= (int)strata_indices_.size(), 00366 "Sampler(): Requested sample count must be at least as large as the number of strata."); 00367 00368 initStrataCount(); 00369 //this is screwing up the random forest tests. 00370 //sample(); 00371 } 00372 00373 /** Return the k-th index in the current sample. 00374 */ 00375 IndexType operator[](int k) const 00376 { 00377 return current_sample_[k]; 00378 } 00379 00380 /** Create a new sample. 00381 */ 00382 void sample(); 00383 00384 /** The total number of data elements. 00385 */ 00386 int totalCount() const 00387 { 00388 return total_count_; 00389 } 00390 00391 /** The number of data elements that have been sampled. 00392 */ 00393 int sampleSize() const 00394 { 00395 return sample_size_; 00396 } 00397 00398 /** Same as sampleSize(). 00399 */ 00400 int size() const 00401 { 00402 return sample_size_; 00403 } 00404 00405 /** The number of strata to be used. 00406 Will be 1 if no strata are given. Will be ignored when 00407 stratifiedSampling() is false. 00408 */ 00409 int strataCount() const 00410 { 00411 return strata_indices_.size(); 00412 } 00413 00414 /** Whether to use stratified sampling. 00415 (If this is false, strata will be ignored even if present.) 00416 */ 00417 bool stratifiedSampling() const 00418 { 00419 return options_.stratified_sampling; 00420 } 00421 00422 /** Whether sampling should be performed with replacement. 00423 */ 00424 bool withReplacement() const 00425 { 00426 return options_.sample_with_replacement; 00427 } 00428 00429 /** Return an array view containing the indices in the current sample. 00430 */ 00431 IndexArrayViewType sampledIndices() const 00432 { 00433 return current_sample_; 00434 } 00435 00436 /** Return an array view containing the out-of-bag indices. 00437 (i.e. the indices that are not in the current sample) 00438 */ 00439 IndexArrayViewType oobIndices() const 00440 { 00441 if(current_oob_count_ == oobInvalid) 00442 { 00443 current_oob_count_ = 0; 00444 for(int i = 0; i<total_count_; ++i) 00445 { 00446 if(!is_used_[i]) 00447 { 00448 current_oob_sample_[current_oob_count_] = i; 00449 ++current_oob_count_; 00450 } 00451 } 00452 } 00453 return current_oob_sample_.subarray(0, current_oob_count_); 00454 } 00455 IsUsedArrayType const & is_used() const 00456 { 00457 return is_used_; 00458 } 00459 }; 00460 00461 00462 template<class Random> 00463 void Sampler<Random>::sample() 00464 { 00465 current_oob_count_ = oobInvalid; 00466 is_used_.init(false); 00467 00468 if(options_.sample_with_replacement) 00469 { 00470 //Go thru all strata 00471 int j = 0; 00472 StrataIndicesType::iterator iter; 00473 for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter) 00474 { 00475 // do sampling with replacement in each strata and copy data. 00476 int stratum_size = iter->second.size(); 00477 for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j) 00478 { 00479 current_sample_[j] = iter->second[random_.uniformInt(stratum_size)]; 00480 is_used_[current_sample_[j]] = true; 00481 } 00482 } 00483 } 00484 else 00485 { 00486 //Go thru all strata 00487 int j = 0; 00488 StrataIndicesType::iterator iter; 00489 for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter) 00490 { 00491 // do sampling without replacement in each strata and copy data. 00492 int stratum_size = iter->second.size(); 00493 for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j) 00494 { 00495 std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]); 00496 current_sample_[j] = iter->second[i]; 00497 is_used_[current_sample_[j]] = true; 00498 } 00499 } 00500 } 00501 } 00502 00503 template<class Random =RandomTT800 > 00504 class PoissonSampler 00505 { 00506 public: 00507 Random randfloat; 00508 typedef Int32 IndexType; 00509 typedef vigra::ArrayVector <IndexType> IndexArrayType; 00510 IndexArrayType used_indices_; 00511 double lambda; 00512 int minIndex; 00513 int maxIndex; 00514 00515 PoissonSampler(double lambda,IndexType minIndex,IndexType maxIndex) 00516 : lambda(lambda), 00517 minIndex(minIndex), 00518 maxIndex(maxIndex) 00519 {} 00520 00521 void sample( ) 00522 { 00523 used_indices_.clear(); 00524 IndexType i; 00525 for(i=minIndex;i<maxIndex;++i) 00526 { 00527 //from http://en.wikipedia.org/wiki/Poisson_distribution 00528 int k=0; 00529 double p=1; 00530 double L=exp(-lambda); 00531 do 00532 { 00533 ++k; 00534 p*=randfloat.uniform53(); 00535 00536 }while(p>L); 00537 --k; 00538 //Insert i this many time 00539 while(k>0) 00540 { 00541 used_indices_.push_back(i); 00542 --k; 00543 } 00544 } 00545 } 00546 00547 IndexType const & operator[](int in) const 00548 { 00549 return used_indices_[in]; 00550 } 00551 00552 int numOfSamples() const 00553 { 00554 return used_indices_.size(); 00555 } 00556 }; 00557 00558 //@} 00559 00560 } // namespace vigra 00561 00562 #endif /*VIGRA_SAMPLING_HXX*/
© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de) |
html generated using doxygen and Python
|