[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]

vigra/index_sampling.hxx VIGRA

00001 /************************************************************************/
00002 /*                                                                      */
00003 /*        Copyright 2008-2009 by  Ullrich Koethe and Rahul Nair         */
00004 /*                                                                      */
00005 /*    This file is part of the VIGRA computer vision library.           */
00006 /*    The VIGRA Website is                                              */
00007 /*        http://hci.iwr.uni-heidelberg.de/vigra/                       */
00008 /*    Please direct questions, bug reports, and contributions to        */
00009 /*        ullrich.koethe@iwr.uni-heidelberg.de    or                    */
00010 /*        vigra@informatik.uni-hamburg.de                               */
00011 /*                                                                      */
00012 /*    Permission is hereby granted, free of charge, to any person       */
00013 /*    obtaining a copy of this software and associated documentation    */
00014 /*    files (the "Software"), to deal in the Software without           */
00015 /*    restriction, including without limitation the rights to use,      */
00016 /*    copy, modify, merge, publish, distribute, sublicense, and/or      */
00017 /*    sell copies of the Software, and to permit persons to whom the    */
00018 /*    Software is furnished to do so, subject to the following          */
00019 /*    conditions:                                                       */
00020 /*                                                                      */
00021 /*    The above copyright notice and this permission notice shall be    */
00022 /*    included in all copies or substantial portions of the             */
00023 /*    Software.                                                         */
00024 /*                                                                      */
00025 /*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
00026 /*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
00027 /*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
00028 /*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
00029 /*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
00030 /*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
00031 /*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
00032 /*    OTHER DEALINGS IN THE SOFTWARE.                                   */
00033 /*                                                                      */
00034 /************************************************************************/
00035 
00036 #ifndef VIGRA_INDEX_SAMPLING_HXX
00037 #define VIGRA_INDEX_SAMPLING_HXX
00038 
00039 #include "array_vector.hxx"
00040 #include "random.hxx"
00041 #include <map>
00042 #include <memory>
00043 #include <cmath>
00044 
00045 namespace vigra
00046 {
00047 
00048 /** \addtogroup MachineLearning Machine Learning
00049 **/
00050 //@{
00051 
00052 
00053 /**\brief Options object for the Sampler class.
00054  
00055   <b>usage:</b>
00056  
00057   \code
00058   SamplerOptions opt =  SamplerOptions()
00059                                .withReplacement()
00060                                .sampleProportion(0.5);
00061   \endcode
00062  
00063   Note that the return value of all methods is <tt>*this</tt> which makes
00064   concatenating of options as above possible.
00065 */
00066 class SamplerOptions
00067 {
00068   public:
00069 
00070     double sample_proportion;
00071     unsigned int sample_size;
00072     bool   sample_with_replacement;
00073     bool   stratified_sampling;
00074     
00075     SamplerOptions()
00076     : sample_proportion(1.0),
00077       sample_size(0), 
00078       sample_with_replacement(true),
00079       stratified_sampling(false)
00080     {}
00081 
00082         /**\brief Sample from training population with replacement.
00083          *
00084          * <br> Default: true
00085          */
00086     SamplerOptions& withReplacement(bool in = true)
00087     {
00088         sample_with_replacement = in;
00089         return *this;
00090     }
00091 
00092         /**\brief Sample from training population without replacement.
00093          *
00094          * <br> Default (if you don't call this function): false
00095          */
00096     SamplerOptions& withoutReplacement(bool in = true)
00097     {
00098         sample_with_replacement = !in;
00099         return *this;
00100     }
00101 
00102         /**\brief Draw the given number of samples.
00103          * If stratifiedSampling is true, the \a size is equally distributed
00104          * accross all strata (e.g. <tt>size / strataCount</tt> samples are taken 
00105          * from each stratum, subject to rounding).
00106          *
00107          * <br> Default: 0 (i.e. determine the count by means of sampleProportion())
00108          */
00109     SamplerOptions& sampleSize(unsigned int size)
00110     {
00111         sample_size = size;
00112         return *this;
00113     }
00114 
00115 
00116         /**\brief Determine the number of samples to draw as a proportion of the total
00117          * number. That is, we draw <tt>count = totalCount * proportion</tt> samples. 
00118          * This option is overridden when an absolute count is specified by sampleSize().
00119          * 
00120          * If stratifiedSampling is true, the count is equally distributed
00121          * accross all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken 
00122          * from each stratum).
00123          *
00124          * <br> Default: 1.0
00125          */
00126     SamplerOptions& sampleProportion(double proportion)
00127     {
00128         vigra_precondition(proportion >= 0.0,
00129                "SamplerOptions::sampleProportion(): argument must not be negative.");
00130         sample_proportion = proportion;
00131         return *this;
00132     }
00133 
00134         /**\brief Draw equally many samples from each "stratum". 
00135          *  A stratum is a group of like entities, e.g. pixels belonging 
00136          *  to the same object class. This is useful to create balanced samples  
00137          *  when the class probabilities are very unbalanced (e.g.
00138          *  when there are many background and few foreground pixels).
00139          *  Stratified sampling thus avoids that a trained classifier is biased 
00140          *  towards the majority class. 
00141          *
00142          * <br> Default (if you don't call this function): false
00143          */
00144     SamplerOptions& stratified(bool in = true)
00145     {
00146         stratified_sampling = in;
00147         return *this;
00148     }
00149 };
00150 
00151 /************************************************************/
00152 /*                                                          */
00153 /*                        Sampler                           */
00154 /*                                                          */
00155 /************************************************************/
00156 
00157 /** \brief Create random samples from a sequence of indices.
00158 
00159     Selecting data items at random is a basic task of machine learning,
00160     for example in boostrapping, RandomForest training, and cross validation.
00161     This class implements various ways to select random samples via their indices. 
00162     Indices are assumed to be consecutive in
00163     the range <tt>0 &lt;= index &lt; total_sample_count</tt>.
00164     
00165     The class always contains a current sample which can be accessed by 
00166     the index operator or by the function sampledIndices(). The indices
00167     that are not in the current sample (out-of-bag indices) can be accessed
00168     via the function oobIndices().
00169     
00170     The sampling method (with/without replacement, stratified or not) and the
00171     number of samples to draw are determined by the option object 
00172     SamplerOptions.
00173     
00174     <b>Usage:</b>
00175     
00176     <b>\#include</b> <<a href="index__sampling_8hxx-source.html">vigra/index_sampling.hxx</a>><br>
00177     Namespace: vigra
00178     
00179     Create a Sampler with default options, i.e. sample as many indices as there 
00180     are data elements, with replacement. On average, the sample will contain 
00181     <tt>0.63*totalCount</tt> distinct indices.
00182     
00183     \code
00184     int totalCount = 10000;   // total number of data elements
00185     int numberOfSamples = 20; // repeat experiment 20 times 
00186     Sampler<> sampler(totalCount);
00187     for(int k=0; k<numberOfSamples; ++k)
00188     {
00189         // process current sample
00190         for(int i=0; i<sampler.sampleSize(); ++i)
00191         {
00192             int currentIndex = sampler[i];
00193             processData(data[currentIndex]);
00194         }
00195         // create next sample
00196         sampler.sample();
00197     }
00198     \endcode
00199     
00200     Create a Sampler for stratified sampling, without replacement.
00201     
00202     \code
00203     // prepare the strata (i.e. specify which stratum each element belongs to)
00204     int stratumSize1 = 2000, stratumSize2 = 8000,
00205         totalCount = stratumSize1 + stratumSize2;
00206     ArrayVerctor<int> strata(totalCount);
00207     for(int i=0; i<stratumSize1; ++i)
00208         strata[i] = 1;
00209     for(int i=stratumSize1; i<stratumSize2; ++i)
00210         strata[i] = 2;
00211         
00212     int sampleSize = 200; // i.e. sample 100 elements from each of the two strata
00213     int numberOfSamples = 20; // repeat experiment 20 times 
00214     Sampler<> stratifiedSampler(strata.begin(), strata.end(),
00215                      SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize));
00216 
00217     for(int k=0; k<numberOfSamples; ++k)
00218     {
00219         // process current sample
00220         for(int i=0; i<sampler.sampleSize(); ++i)
00221         {
00222             int currentIndex = sampler[i];
00223             processData(data[currentIndex]);
00224         }
00225         // create next sample
00226         sampler.sample();
00227     }
00228     \endcode
00229 */
00230 template<class Random = MersenneTwister >
00231 class Sampler
00232 {
00233   public:
00234         /** Internal type of the indices.
00235             Currently, 64-bit indices are not supported because this
00236             requires extension of the random number generator classes.
00237         */
00238     typedef Int32                               IndexType;
00239     
00240     typedef ArrayVector     <IndexType>  IndexArrayType;
00241     
00242         /** Type of the array view object that is returned by 
00243             sampledIndices() and oobIndices().
00244         */
00245     typedef ArrayVectorView <IndexType>  IndexArrayViewType;
00246 
00247   private:
00248     typedef std::map<IndexType, IndexArrayType> StrataIndicesType;
00249     typedef std::map<IndexType, int> StrataSizesType;
00250     typedef ArrayVector     <bool>       IsUsedArrayType;
00251     typedef ArrayVectorView <bool>       IsUsedArrayViewType;
00252     
00253     static const int oobInvalid = -1;
00254 
00255     int total_count_, sample_size_;
00256     mutable int current_oob_count_;
00257     StrataIndicesType     strata_indices_;
00258     StrataSizesType       strata_sample_size_;
00259     IndexArrayType        current_sample_;
00260     mutable IndexArrayType        current_oob_sample_;
00261     IsUsedArrayType       is_used_;
00262     Random random_;
00263     SamplerOptions options_;
00264 
00265     void initStrataCount()
00266     {
00267         // compute how many samples to take from each stratum
00268         // (may be unequal if sample_size_ is not a multiple of strataCount())
00269         int strata_sample_size = (int)std::ceil(double(sample_size_) / strataCount());
00270         int strata_total_count = strata_sample_size * strataCount();
00271 
00272         for(StrataIndicesType::iterator i = strata_indices_.begin(); 
00273              i != strata_indices_.end(); ++i)
00274         {
00275             if(strata_total_count > sample_size_)
00276             {
00277                 strata_sample_size_[i->first] = strata_sample_size - 1;
00278                 --strata_total_count;
00279             }
00280             else
00281             {
00282                 strata_sample_size_[i->first] = strata_sample_size;
00283             }
00284         }
00285     }
00286 
00287   public:
00288     
00289         /** Create a sampler for \a totalCount data objects.
00290             
00291             In each invocation of <tt>sample()</tt> below, it will sample
00292             indices according to the options passed. If no options are given, 
00293             <tt>totalCount</tt> indices will be drawn with replacement.
00294         */
00295     Sampler(UInt32 totalCount, SamplerOptions const & opt = SamplerOptions(), 
00296             Random const & rnd = Random(RandomSeed))
00297     : total_count_(totalCount),
00298       sample_size_(opt.sample_size == 0
00299                          ? (int)(std::ceil(total_count_ * opt.sample_proportion))
00300                          : opt.sample_size),
00301       current_oob_count_(oobInvalid),
00302       current_sample_(sample_size_),
00303       current_oob_sample_(total_count_),
00304       is_used_(total_count_),
00305       random_(rnd),
00306       options_(opt)
00307     {
00308         vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
00309           "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
00310           
00311         vigra_precondition(!opt.stratified_sampling,
00312           "Sampler(): Stratified sampling requested, but no strata given.");
00313           
00314         // initialize a single stratum containing all data
00315         strata_indices_[0].resize(total_count_);
00316         for(int i=0; i<total_count_; ++i)
00317             strata_indices_[0][i] = i;
00318 
00319         initStrataCount();
00320         sample();
00321     }
00322     
00323         /** Create a sampler for stratified sampling.
00324             
00325             <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence 
00326             which specifies for each sample the stratum it belongs to. The
00327             total number of data objects will be set to <tt>strataEnd - strataBegin</tt>.
00328             Equally many samples (subject to rounding) will be drawn from each stratum, 
00329             unless the option object explicitly requests unstratified sampling, 
00330             in which case the strata are ignored.
00331         */
00332     template <class Iterator>
00333     Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(), 
00334             Random const & rnd = Random(RandomSeed))
00335     : total_count_(strataEnd - strataBegin),
00336       sample_size_(opt.sample_size == 0
00337                          ? (int)(std::ceil(total_count_ * opt.sample_proportion))
00338                          : opt.sample_size),
00339       current_oob_count_(oobInvalid),
00340       current_sample_(sample_size_),
00341       current_oob_sample_(total_count_),
00342       is_used_(total_count_),
00343       random_(rnd),
00344       options_(opt)
00345     {
00346         vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
00347           "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
00348           
00349         // copy the strata indices
00350         for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin)
00351         {
00352             strata_indices_[*strataBegin].push_back(i);
00353         }
00354         vigra_precondition(sample_size_ >= (int)strata_indices_.size(),
00355             "Sampler(): Requested sample count must be at least as large as the number of strata.");
00356 
00357         initStrataCount();
00358         sample();
00359     }
00360 
00361         /** Return the k-th index in the current sample.
00362          */
00363     IndexType operator[](int k) const
00364     {
00365         return current_sample_[k];
00366     }
00367 
00368         /** Create a new sample.
00369          */
00370     void sample();
00371 
00372         /** The total number of data elements.
00373          */
00374     int totalCount() const
00375     {
00376         return total_count_;
00377     }
00378 
00379         /** The number of data elements that have been sampled.
00380          */
00381     int sampleSize() const
00382     {
00383         return sample_size_;
00384     }
00385 
00386         /** Same as sampleSize().
00387          */
00388     int size() const
00389     {
00390         return sample_size_;
00391     }
00392 
00393         /** The number of strata to be used.
00394             Will be 1 if no strata are given. Will be ognored when
00395             stratifiedSampling() is false.
00396          */
00397     int strataCount() const
00398     {
00399         return strata_indices_.size();
00400     }
00401 
00402         /** Whether to use stratified sampling.
00403             (If this is false, strata will be ignored even if present.)
00404          */
00405     bool stratifiedSampling() const
00406     {
00407         return options_.stratified_sampling;
00408     }
00409     
00410         /** Whether sampling should be performed with replacement.
00411          */
00412     bool withReplacement() const
00413     {
00414         return options_.sample_with_replacement;
00415     }
00416     
00417         /** Return an array view containing the indices in the current sample.
00418          */
00419     IndexArrayViewType sampledIndices() const
00420     {
00421         return current_sample_;
00422     }
00423     
00424         /** Return an array view containing the out-of-bag indices.
00425             (i.e. the indices that are not in the current sample)
00426          */
00427     IndexArrayViewType oobIndices() const
00428     {
00429         if(current_oob_count_ == oobInvalid)
00430         {
00431             current_oob_count_ = 0;
00432             for(int i = 0; i<total_count_; ++i)
00433             {
00434                 if(!is_used_[i])
00435                 {
00436                     current_oob_sample_[current_oob_count_] = i;
00437                     ++current_oob_count_;
00438                 }
00439             }
00440         }
00441         return current_oob_sample_.subarray(0, current_oob_count_);
00442     }
00443 };
00444 
00445 
00446 template<class Random>
00447 void Sampler<Random>::sample()
00448 {
00449     current_oob_count_ = oobInvalid;
00450     is_used_.init(false);
00451     
00452     if(options_.sample_with_replacement)
00453     {
00454         //Go thru all strata
00455         int j = 0;
00456         StrataIndicesType::iterator iter;
00457         for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
00458         {
00459             // do sampling with replacement in each strata and copy data.
00460             int stratum_size = iter->second.size();
00461             for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
00462             {
00463                 current_sample_[j] = iter->second[random_.uniformInt(stratum_size)];
00464                 is_used_[current_sample_[j]] = true;
00465             }
00466         }
00467     }
00468     else
00469     {
00470         //Go thru all strata
00471         int j = 0;
00472         StrataIndicesType::iterator iter;
00473         for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
00474         {
00475             // do sampling without replacement in each strata and copy data.
00476             int stratum_size = iter->second.size();
00477             for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
00478             {
00479                 std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]);
00480                 current_sample_[j] = iter->second[i];
00481                 is_used_[current_sample_[j]] = true;
00482             }
00483         }
00484     }
00485 }
00486 
00487 
00488 //@}
00489 
00490 } // namespace vigra
00491 
00492 #endif /*VIGRA_INDEX_SAMPLING_HXX*/

© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de)
Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany

html generated using doxygen and Python
vigra 1.7.1 (28 Sep 2010)