[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]

vigra/sampling.hxx VIGRA

00001 /************************************************************************/
00002 /*                                                                      */
00003 /*        Copyright 2008-2009 by  Ullrich Koethe and Rahul Nair         */
00004 /*                                                                      */
00005 /*    This file is part of the VIGRA computer vision library.           */
00006 /*    The VIGRA Website is                                              */
00007 /*        http://hci.iwr.uni-heidelberg.de/vigra/                       */
00008 /*    Please direct questions, bug reports, and contributions to        */
00009 /*        ullrich.koethe@iwr.uni-heidelberg.de    or                    */
00010 /*        vigra@informatik.uni-hamburg.de                               */
00011 /*                                                                      */
00012 /*    Permission is hereby granted, free of charge, to any person       */
00013 /*    obtaining a copy of this software and associated documentation    */
00014 /*    files (the "Software"), to deal in the Software without           */
00015 /*    restriction, including without limitation the rights to use,      */
00016 /*    copy, modify, merge, publish, distribute, sublicense, and/or      */
00017 /*    sell copies of the Software, and to permit persons to whom the    */
00018 /*    Software is furnished to do so, subject to the following          */
00019 /*    conditions:                                                       */
00020 /*                                                                      */
00021 /*    The above copyright notice and this permission notice shall be    */
00022 /*    included in all copies or substantial portions of the             */
00023 /*    Software.                                                         */
00024 /*                                                                      */
00025 /*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
00026 /*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
00027 /*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
00028 /*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
00029 /*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
00030 /*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
00031 /*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
00032 /*    OTHER DEALINGS IN THE SOFTWARE.                                   */
00033 /*                                                                      */
00034 /************************************************************************/
00035 
00036 #ifndef VIGRA_SAMPLING_HXX
00037 #define VIGRA_SAMPLING_HXX
00038 
00039 #include "array_vector.hxx"
00040 #include "random.hxx"
00041 #include <map>
00042 #include <memory>
00043 #include <cmath>
00044 
00045 namespace vigra
00046 {
00047 
00048 /** \addtogroup MachineLearning Machine Learning
00049 **/
00050 //@{
00051 
00052 
00053 /**\brief Options object for the Sampler class.
00054  
00055   <b>usage:</b>
00056  
00057   \code
00058   SamplerOptions opt =  SamplerOptions()
00059                                .withReplacement()
00060                                .sampleProportion(0.5);
00061   \endcode
00062  
00063   Note that the return value of all methods is <tt>*this</tt> which makes
00064   concatenating of options as above possible.
00065 */
00066 class SamplerOptions
00067 {
00068   public:
00069 
00070     double sample_proportion;
00071     unsigned int sample_size;
00072     bool   sample_with_replacement;
00073     bool   stratified_sampling;
00074     
00075     SamplerOptions()
00076     : sample_proportion(1.0),
00077       sample_size(0), 
00078       sample_with_replacement(true),
00079       stratified_sampling(false)
00080     {}
00081 
00082         /**\brief Sample from training population with replacement.
00083          *
00084          * <br> Default: true
00085          */
00086     SamplerOptions& withReplacement(bool in = true)
00087     {
00088         sample_with_replacement = in;
00089         return *this;
00090     }
00091 
00092         /**\brief Sample from training population without replacement.
00093          *
00094          * <br> Default (if you don't call this function): false
00095          */
00096     SamplerOptions& withoutReplacement(bool in = true)
00097     {
00098         sample_with_replacement = !in;
00099         return *this;
00100     }
00101 
00102         /**\brief Draw the given number of samples.
00103          * If stratifiedSampling is true, the \a size is equally distributed
00104          * across all strata (e.g. <tt>size / strataCount</tt> samples are taken 
00105          * from each stratum, subject to rounding).
00106          *
00107          * <br> Default: 0 (i.e. determine the count by means of sampleProportion())
00108          */
00109     SamplerOptions& sampleSize(unsigned int size)
00110     {
00111         sample_size = size;
00112         return *this;
00113     }
00114 
00115 
00116         /**\brief Determine the number of samples to draw as a proportion of the total
00117          * number. That is, we draw <tt>count = totalCount * proportion</tt> samples. 
00118          * This option is overridden when an absolute count is specified by sampleSize().
00119          * 
00120          * If stratifiedSampling is true, the count is equally distributed
00121          * across all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken 
00122          * from each stratum).
00123          *
00124          * <br> Default: 1.0
00125          */
00126     SamplerOptions& sampleProportion(double proportion)
00127     {
00128         vigra_precondition(proportion >= 0.0,
00129                "SamplerOptions::sampleProportion(): argument must not be negative.");
00130         sample_proportion = proportion;
00131         return *this;
00132     }
00133 
00134         /**\brief Draw equally many samples from each "stratum". 
00135          *  A stratum is a group of like entities, e.g. pixels belonging 
00136          *  to the same object class. This is useful to create balanced samples  
00137          *  when the class probabilities are very unbalanced (e.g.
00138          *  when there are many background and few foreground pixels).
00139          *  Stratified sampling thus avoids that a trained classifier is biased 
00140          *  towards the majority class. 
00141          *
00142          * <br> Default (if you don't call this function): false
00143          */
00144     SamplerOptions& stratified(bool in = true)
00145     {
00146         stratified_sampling = in;
00147         return *this;
00148     }
00149 };
00150 
00151 /************************************************************/
00152 /*                                                          */
00153 /*                        Sampler                           */
00154 /*                                                          */
00155 /************************************************************/
00156 
00157 /** \brief Create random samples from a sequence of indices.
00158 
00159     Selecting data items at random is a basic task of machine learning,
00160     for example in boostrapping, RandomForest training, and cross validation.
00161     This class implements various ways to select random samples via their indices. 
00162     Indices are assumed to be consecutive in
00163     the range <tt>0 &lt;= index &lt; total_sample_count</tt>.
00164     
00165     The class always contains a current sample which can be accessed by 
00166     the index operator or by the function sampledIndices(). The indices
00167     that are not in the current sample (out-of-bag indices) can be accessed
00168     via the function oobIndices().
00169     
00170     The sampling method (with/without replacement, stratified or not) and the
00171     number of samples to draw are determined by the option object 
00172     SamplerOptions.
00173     
00174     <b>Usage:</b>
00175     
00176     <b>\#include</b> <vigra/sampling.hxx><br>
00177     Namespace: vigra
00178     
00179     Create a Sampler with default options, i.e. sample as many indices as there 
00180     are data elements, with replacement. On average, the sample will contain 
00181     <tt>0.63*totalCount</tt> distinct indices.
00182     
00183     \code
00184     int totalCount = 10000;   // total number of data elements
00185     int numberOfSamples = 20; // repeat experiment 20 times 
00186     Sampler<> sampler(totalCount);
00187     for(int k=0; k<numberOfSamples; ++k)
00188     {
00189         // process current sample
00190         for(int i=0; i<sampler.sampleSize(); ++i)
00191         {
00192             int currentIndex = sampler[i];
00193             processData(data[currentIndex]);
00194         }
00195         // create next sample
00196         sampler.sample();
00197     }
00198     \endcode
00199     
00200     Create a Sampler for stratified sampling, without replacement.
00201     
00202     \code
00203     // prepare the strata (i.e. specify which stratum each element belongs to)
00204     int stratumSize1 = 2000, stratumSize2 = 8000,
00205         totalCount = stratumSize1 + stratumSize2;
00206     ArrayVerctor<int> strata(totalCount);
00207     for(int i=0; i<stratumSize1; ++i)
00208         strata[i] = 1;
00209     for(int i=stratumSize1; i<stratumSize2; ++i)
00210         strata[i] = 2;
00211         
00212     int sampleSize = 200; // i.e. sample 100 elements from each of the two strata
00213     int numberOfSamples = 20; // repeat experiment 20 times 
00214     Sampler<> stratifiedSampler(strata.begin(), strata.end(),
00215                      SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize));
00216 
00217     for(int k=0; k<numberOfSamples; ++k)
00218     {
00219         // process current sample
00220         for(int i=0; i<sampler.sampleSize(); ++i)
00221         {
00222             int currentIndex = sampler[i];
00223             processData(data[currentIndex]);
00224         }
00225         // create next sample
00226         sampler.sample();
00227     }
00228     \endcode
00229 */
00230 template<class Random = MersenneTwister >
00231 class Sampler
00232 {
00233   public:
00234         /** Internal type of the indices.
00235             Currently, 64-bit indices are not supported because this
00236             requires extension of the random number generator classes.
00237         */
00238     typedef Int32                               IndexType;
00239     
00240     typedef ArrayVector     <IndexType>  IndexArrayType;
00241     
00242         /** Type of the array view object that is returned by 
00243             sampledIndices() and oobIndices().
00244         */
00245     typedef ArrayVectorView <IndexType>  IndexArrayViewType;
00246 
00247   private:
00248     typedef std::map<IndexType, IndexArrayType> StrataIndicesType;
00249     typedef std::map<IndexType, int> StrataSizesType;
00250     typedef ArrayVector     <bool>       IsUsedArrayType;
00251     typedef ArrayVectorView <bool>       IsUsedArrayViewType;
00252     
00253     static const int oobInvalid = -1;
00254 
00255     int total_count_, sample_size_;
00256     mutable int current_oob_count_;
00257     StrataIndicesType     strata_indices_;
00258     StrataSizesType       strata_sample_size_;
00259     IndexArrayType        current_sample_;
00260     mutable IndexArrayType        current_oob_sample_;
00261     IsUsedArrayType       is_used_;
00262     Random const & random_;
00263     SamplerOptions options_;
00264 
00265     void initStrataCount()
00266     {
00267         // compute how many samples to take from each stratum
00268         // (may be unequal if sample_size_ is not a multiple of strataCount())
00269         int strata_sample_size = (int)std::ceil(double(sample_size_) / strataCount());
00270         int strata_total_count = strata_sample_size * strataCount();
00271 
00272         for(StrataIndicesType::iterator i = strata_indices_.begin(); 
00273              i != strata_indices_.end(); ++i)
00274         {
00275             if(strata_total_count > sample_size_)
00276             {
00277                 strata_sample_size_[i->first] = strata_sample_size - 1;
00278                 --strata_total_count;
00279             }
00280             else
00281             {
00282                 strata_sample_size_[i->first] = strata_sample_size;
00283             }
00284         }
00285     }
00286 
00287   public:
00288     
00289         /** Create a sampler for \a totalCount data objects.
00290             
00291             In each invocation of <tt>sample()</tt> below, it will sample
00292             indices according to the options passed. If no options are given, 
00293             <tt>totalCount</tt> indices will be drawn with replacement.
00294         */
00295     Sampler(UInt32 totalCount, SamplerOptions const & opt = SamplerOptions(), 
00296             Random const & rnd = Random(RandomSeed))
00297     : total_count_(totalCount),
00298       sample_size_(opt.sample_size == 0
00299                          ? (int)(std::ceil(total_count_ * opt.sample_proportion))
00300                          : opt.sample_size),
00301       current_oob_count_(oobInvalid),
00302       current_sample_(sample_size_),
00303       current_oob_sample_(total_count_),
00304       is_used_(total_count_),
00305       random_(rnd),
00306       options_(opt)
00307     {
00308         vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
00309           "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
00310           
00311         vigra_precondition(!opt.stratified_sampling,
00312           "Sampler(): Stratified sampling requested, but no strata given.");
00313           
00314         // initialize a single stratum containing all data
00315         strata_indices_[0].resize(total_count_);
00316         for(int i=0; i<total_count_; ++i)
00317             strata_indices_[0][i] = i;
00318 
00319         initStrataCount();
00320         //this is screwing up the random forest tests.
00321         //sample();
00322     }
00323     
00324         /** Create a sampler for stratified sampling.
00325             
00326             <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence 
00327             which specifies for each sample the stratum it belongs to. The
00328             total number of data objects will be set to <tt>strataEnd - strataBegin</tt>.
00329             Equally many samples (subject to rounding) will be drawn from each stratum, 
00330             unless the option object explicitly requests unstratified sampling, 
00331             in which case the strata are ignored.
00332         */
00333     template <class Iterator>
00334     Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(), 
00335             Random const & rnd = Random(RandomSeed))
00336     : total_count_(strataEnd - strataBegin),
00337       sample_size_(opt.sample_size == 0
00338                          ? (int)(std::ceil(total_count_ * opt.sample_proportion))
00339                          : opt.sample_size),
00340       current_oob_count_(oobInvalid),
00341       current_sample_(sample_size_),
00342       current_oob_sample_(total_count_),
00343       is_used_(total_count_),
00344       random_(rnd),
00345       options_(opt)
00346     {
00347         vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
00348           "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
00349           
00350         // copy the strata indices
00351         if(opt.stratified_sampling)
00352         {
00353             for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin)
00354             {
00355                 strata_indices_[*strataBegin].push_back(i);
00356             }
00357         }
00358         else
00359         {
00360             strata_indices_[0].resize(total_count_);
00361             for(int i=0; i<total_count_; ++i)
00362                 strata_indices_[0][i] = i;
00363         }
00364             
00365         vigra_precondition(sample_size_ >= (int)strata_indices_.size(),
00366             "Sampler(): Requested sample count must be at least as large as the number of strata.");
00367 
00368         initStrataCount();
00369         //this is screwing up the random forest tests.
00370         //sample();
00371     }
00372 
00373         /** Return the k-th index in the current sample.
00374          */
00375     IndexType operator[](int k) const
00376     {
00377         return current_sample_[k];
00378     }
00379 
00380         /** Create a new sample.
00381          */
00382     void sample();
00383 
00384         /** The total number of data elements.
00385          */
00386     int totalCount() const
00387     {
00388         return total_count_;
00389     }
00390 
00391         /** The number of data elements that have been sampled.
00392          */
00393     int sampleSize() const
00394     {
00395         return sample_size_;
00396     }
00397 
00398         /** Same as sampleSize().
00399          */
00400     int size() const
00401     {
00402         return sample_size_;
00403     }
00404 
00405         /** The number of strata to be used.
00406             Will be 1 if no strata are given. Will be ignored when
00407             stratifiedSampling() is false.
00408          */
00409     int strataCount() const
00410     {
00411         return strata_indices_.size();
00412     }
00413 
00414         /** Whether to use stratified sampling.
00415             (If this is false, strata will be ignored even if present.)
00416          */
00417     bool stratifiedSampling() const
00418     {
00419         return options_.stratified_sampling;
00420     }
00421     
00422         /** Whether sampling should be performed with replacement.
00423          */
00424     bool withReplacement() const
00425     {
00426         return options_.sample_with_replacement;
00427     }
00428     
00429         /** Return an array view containing the indices in the current sample.
00430          */
00431     IndexArrayViewType sampledIndices() const
00432     {
00433         return current_sample_;
00434     }
00435     
00436         /** Return an array view containing the out-of-bag indices.
00437             (i.e. the indices that are not in the current sample)
00438          */
00439     IndexArrayViewType oobIndices() const
00440     {
00441         if(current_oob_count_ == oobInvalid)
00442         {
00443             current_oob_count_ = 0;
00444             for(int i = 0; i<total_count_; ++i)
00445             {
00446                 if(!is_used_[i])
00447                 {
00448                     current_oob_sample_[current_oob_count_] = i;
00449                     ++current_oob_count_;
00450                 }
00451             }
00452         }
00453         return current_oob_sample_.subarray(0, current_oob_count_);
00454     }
00455     IsUsedArrayType const & is_used() const
00456     {
00457         return is_used_;
00458     }
00459 };
00460 
00461 
00462 template<class Random>
00463 void Sampler<Random>::sample()
00464 {
00465     current_oob_count_ = oobInvalid;
00466     is_used_.init(false);
00467     
00468     if(options_.sample_with_replacement)
00469     {
00470         //Go thru all strata
00471         int j = 0;
00472         StrataIndicesType::iterator iter;
00473         for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
00474         {
00475             // do sampling with replacement in each strata and copy data.
00476             int stratum_size = iter->second.size();
00477             for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
00478             {
00479                 current_sample_[j] = iter->second[random_.uniformInt(stratum_size)];
00480                 is_used_[current_sample_[j]] = true;
00481             }
00482         }
00483     }
00484     else
00485     {
00486         //Go thru all strata
00487         int j = 0;
00488         StrataIndicesType::iterator iter;
00489         for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
00490         {
00491             // do sampling without replacement in each strata and copy data.
00492             int stratum_size = iter->second.size();
00493             for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
00494             {
00495                 std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]);
00496                 current_sample_[j] = iter->second[i];
00497                 is_used_[current_sample_[j]] = true;
00498             }
00499         }
00500     }
00501 }
00502 
00503 template<class Random =RandomTT800 >
00504 class PoissonSampler
00505 {
00506 public:
00507     Random  randfloat;
00508     typedef Int32                               IndexType;
00509     typedef vigra::ArrayVector     <IndexType>  IndexArrayType;
00510     IndexArrayType        used_indices_;
00511     double lambda;
00512     int minIndex;
00513     int maxIndex;
00514     
00515     PoissonSampler(double lambda,IndexType minIndex,IndexType maxIndex)
00516     : lambda(lambda),
00517       minIndex(minIndex),
00518       maxIndex(maxIndex)
00519     {}
00520 
00521     void sample(  )
00522     {
00523         used_indices_.clear();
00524         IndexType i;
00525         for(i=minIndex;i<maxIndex;++i)
00526         {
00527             //from http://en.wikipedia.org/wiki/Poisson_distribution
00528             int k=0;
00529             double p=1;
00530             double L=exp(-lambda);
00531             do
00532             {
00533                 ++k;
00534                 p*=randfloat.uniform53();
00535 
00536             }while(p>L);
00537             --k;
00538             //Insert i this many time
00539             while(k>0)
00540             {
00541                 used_indices_.push_back(i);
00542                 --k;
00543             }
00544         }
00545     }
00546 
00547     IndexType const & operator[](int in) const
00548     {
00549         return used_indices_[in];
00550     }
00551     
00552     int numOfSamples() const
00553     {
00554         return used_indices_.size();
00555     }
00556 };
00557 
00558 //@}
00559 
00560 } // namespace vigra
00561 
00562 #endif /*VIGRA_SAMPLING_HXX*/

© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de)
Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany

html generated using doxygen and Python
vigra 1.8.0 (20 Sep 2011)