vigra/random_forest/rf_algorithm.hxx
/************************************************************************/
/*                                                                      */
/*        Copyright 2008-2009 by Rahul Nair                             */
/*                                                                      */
/*    This file is part of the VIGRA computer vision library.           */
/*    The VIGRA Website is                                              */
/*        http://hci.iwr.uni-heidelberg.de/vigra/                       */
/*    Please direct questions, bug reports, and contributions to        */
/*        ullrich.koethe@iwr.uni-heidelberg.de    or                    */
/*        vigra@informatik.uni-hamburg.de                               */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,   */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/
#ifndef VIGRA_RF_ALGORITHM_HXX
#define VIGRA_RF_ALGORITHM_HXX

#include <vector>
#include "splices.hxx"
#include <queue>
#include <fstream>

namespace vigra
{

namespace rf
{

/** This namespace contains all algorithms developed for feature selection.
 */
namespace algorithms
{

namespace detail
{
    /** Create a MultiArray containing only the columns supplied between
     *  iterators b and e.
     */
    template<class OrigMultiArray,
             class Iter,
             class DestMultiArray>
    void choose(OrigMultiArray const & in,
                Iter const & b,
                Iter const & e,
                DestMultiArray & out)
    {
        int columnCount = std::distance(b, e);
        int rowCount = in.shape(0);
        out.reshape(MultiArrayShape<2>::type(rowCount, columnCount));
        int ii = 0;
        for(Iter iter = b; iter != e; ++iter, ++ii)
        {
            columnVector(out, ii) = columnVector(in, *iter);
        }
    }
} // namespace detail


/** Standard random forest error rate callback functor.
 *
 *  Returns the random forest OOB error estimate when invoked.
 */
class RFErrorCallback
{
    RandomForestOptions options;

  public:
    /** Default constructor
     *
     *  optionally supply options for the random forest classifier
     *  \sa RandomForestOptions
     */
    RFErrorCallback(RandomForestOptions opt = RandomForestOptions())
    : options(opt)
    {}

    /** returns the random forest OOB error estimate given features
     *  and labels
     */
    template<class Feature_t, class Response_t>
    double operator()(Feature_t const & features,
                      Response_t const & response)
    {
        RandomForest<> rf(options);
        visitors::OOB_Error oob;
        rf.learn(features, response, visitors::create_visitor(oob));
        return oob.oob_breiman;
    }
};
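/* Any functor with a call operator of the form
 * double operator()(Feature_t const &, Response_t const &) can serve as an
 * ErrorRateCallBack for the selection functions below. A minimal sketch of
 * a custom callback (the tree count of 255 is arbitrary, for illustration
 * only):
 *
 * \code
 * struct MyErrorCallback
 * {
 *     template<class Feature_t, class Response_t>
 *     double operator()(Feature_t const & features, Response_t const & response)
 *     {
 *         RandomForest<> rf(RandomForestOptions().tree_count(255));
 *         visitors::OOB_Error oob;
 *         rf.learn(features, response, visitors::create_visitor(oob));
 *         return oob.oob_breiman;
 *     }
 * };
 * \endcode
 */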
/** Structure to hold the results of variable selection.
 */
class VariableSelectionResult
{
    bool initialized;

  public:
    VariableSelectionResult()
    : initialized(false)
    {}

    typedef std::vector<int>        FeatureList_t;
    typedef std::vector<double>     ErrorList_t;
    typedef FeatureList_t::iterator Pivot_t;

    Pivot_t pivot;

    /** list of features.
     */
    FeatureList_t selected;

    /** vector of size (number of features)
     *
     *  The i-th entry encodes the error rate obtained while using
     *  features [0 .. i] (including i).
     *
     *  If the i-th entry is -1, no error rate was computed for it.
     *  This may happen if more than one feature is added to the
     *  selected list in one step of the algorithm.
     *
     *  During initialization, the last entry (errors.back()) is
     *  always filled.
     */
    ErrorList_t errors;

    /** error rate when no features are used
     */
    double no_features;

    template<class FeatureT,
             class ResponseT,
             class Iter,
             class ErrorRateCallBack>
    bool init(FeatureT const & all_features,
              ResponseT const & response,
              Iter b,
              Iter e,
              ErrorRateCallBack errorcallback)
    {
        bool ret_ = init(all_features, response, errorcallback);
        if(!ret_)
            return false;
        vigra_precondition(std::distance(b, e) == (std::ptrdiff_t)selected.size(),
            "Number of features in ranking != number of features in matrix");
        std::copy(b, e, selected.begin());
        return true;
    }

    template<class FeatureT,
             class ResponseT,
             class Iter>
    bool init(FeatureT const & all_features,
              ResponseT const & response,
              Iter b,
              Iter e)
    {
        RFErrorCallback ecallback;
        return init(all_features, response, b, e, ecallback);
    }

    template<class FeatureT,
             class ResponseT>
    bool init(FeatureT const & all_features,
              ResponseT const & response)
    {
        return init(all_features, response, RFErrorCallback());
    }

    /** Initialization routine. Will be called only once in the lifetime
     *  of a VariableSelectionResult. Subsequent calls will not reinitialize
     *  the member variables.
     *
     *  This is intended to allow continuing variable selection at a point
     *  stopped in an earlier iteration.
     *
     *  Returns true if initialization was successful and false if
     *  the object was already initialized before.
     */
    template<class FeatureT,
             class ResponseT,
             class ErrorRateCallBack>
    bool init(FeatureT const & all_features,
              ResponseT const & response,
              ErrorRateCallBack errorcallback)
    {
        if(initialized)
        {
            return false;
        }
        initialized = true;
        // calculate the error with all features
        selected.resize(all_features.shape(1), 0);
        for(unsigned int ii = 0; ii < selected.size(); ++ii)
            selected[ii] = ii;
        errors.resize(all_features.shape(1), -1);
        errors.back() = errorcallback(all_features, response);

        // calculate the error rate if no features are chosen --
        // corresponds to max(prior probability) of the classes
        std::map<typename ResponseT::value_type, int> res_map;
        std::vector<int> cts;
        int counter = 0;
        for(int ii = 0; ii < response.shape(0); ++ii)
        {
            if(res_map.find(response(ii, 0)) == res_map.end())
            {
                res_map[response(ii, 0)] = counter;
                ++counter;
                cts.push_back(0);
            }
            cts[res_map[response(ii, 0)]] += 1;
        }
        no_features = double(*(std::max_element(cts.begin(), cts.end())))
                      / double(response.shape(0));

        /* init not_selected vector;
        not_selected.resize(all_features.shape(1), 0);
        for(int ii = 0; ii < not_selected.size(); ++ii)
        {
            not_selected[ii] = ii;
        }
        initialized = true;
        */
        pivot = selected.begin();
        return true;
    }
};
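/* A sketch of reading the results back out after a selection run (assuming
 * feature/label matrices as in the examples below): selected holds the
 * feature indices in selection order, and errors[ii] is the error rate
 * obtained with features selected[0..ii], or -1 if it was not computed.
 *
 * \code
 * VariableSelectionResult result;
 * forward_selection(features, labels, result);
 * for(unsigned int ii = 0; ii < result.selected.size(); ++ii)
 * {
 *     if(result.errors[ii] != -1)
 *         std::cout << "using " << (ii + 1) << " features (last added: "
 *                   << result.selected[ii] << "), error: "
 *                   << result.errors[ii] << std::endl;
 * }
 * \endcode
 */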
/** Perform forward selection
 *
 * \param features IN: n x p matrix containing n instances with p attributes/features
 *                 used in the variable selection algorithm
 * \param response IN: n x 1 matrix containing the corresponding response
 * \param result   IN/OUT: VariableSelectionResult struct which will contain the results
 *                 of the algorithm.
 *                 Features between result.selected.begin() and result.pivot will
 *                 be left untouched.
 *                 \sa VariableSelectionResult
 * \param errorcallback
 *                 IN, OPTIONAL:
 *                 Functor that returns the error rate given a set of
 *                 features and labels. Default is the random forest OOB error.
 *
 * Forward selection successively chooses the next feature that decreases
 * the error rate most.
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * VariableSelectionResult result;
 * forward_selection(features, labels, result);
 * \endcode
 * To use forward selection but ensure that a specific feature, e.g. feature 5,
 * is always included, one would do the following:
 *
 * \code
 * VariableSelectionResult result;
 * result.init(features, labels);
 * std::swap(result.selected[0], result.selected[5]);
 * result.pivot = result.selected.begin() + 1;
 * forward_selection(features, labels, result);
 * \endcode
 *
 * \sa VariableSelectionResult
 *
 */
template<class FeatureT, class ResponseT, class ErrorRateCallBack>
void forward_selection(FeatureT const & features,
                       ResponseT const & response,
                       VariableSelectionResult & result,
                       ErrorRateCallBack errorcallback)
{
    VariableSelectionResult::FeatureList_t & selected = result.selected;
    VariableSelectionResult::ErrorList_t &   errors   = result.errors;
    VariableSelectionResult::Pivot_t &       pivot    = result.pivot;
    int featureCount = features.shape(1);
    // initialize the result struct if it is used for the first time
    if(!result.init(features, response, errorcallback))
    {
        // result is being reused -- just ensure that the number of features
        // is the same.
        vigra_precondition((int)selected.size() == featureCount,
            "forward_selection(): Number of features in feature "
            "matrix and number of features in previously used "
            "result struct mismatch!");
    }

    int not_selected_size = std::distance(pivot, selected.end());
    while(not_selected_size > 1)
    {
        std::vector<double> current_errors;
        VariableSelectionResult::Pivot_t next = pivot;
        for(int ii = 0; ii < not_selected_size; ++ii, ++next)
        {
            std::swap(*pivot, *next);
            MultiArray<2, double> cur_feats;
            detail::choose(features, selected.begin(), pivot+1, cur_feats);
            double error = errorcallback(cur_feats, response);
            current_errors.push_back(error);
            std::swap(*pivot, *next);
        }
        int pos = std::distance(current_errors.begin(),
                                std::min_element(current_errors.begin(),
                                                 current_errors.end()));
        next = pivot;
        std::advance(next, pos);
        std::swap(*pivot, *next);
        errors[std::distance(selected.begin(), pivot)] = current_errors[pos];
#ifdef RN_VERBOSE
        std::copy(current_errors.begin(), current_errors.end(),
                  std::ostream_iterator<double>(std::cerr, ", "));
        std::cerr << "Choosing " << *pivot << " at error of " << current_errors[pos] << std::endl;
#endif
        ++pivot;
        not_selected_size = std::distance(pivot, selected.end());
    }
}

template<class FeatureT, class ResponseT>
void forward_selection(FeatureT const & features,
                       ResponseT const & response,
                       VariableSelectionResult & result)
{
    forward_selection(features, response, result, RFErrorCallback());
}
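/* Because VariableSelectionResult::init() runs only once per object, the
 * same result struct can be handed to forward_selection() again to resume a
 * run that stopped earlier (e.g. due to an exception thrown from the error
 * callback) -- features before result.pivot are kept. A sketch:
 *
 * \code
 * VariableSelectionResult result;
 * try
 * {
 *     forward_selection(features, labels, result);  // may stop part way
 * }
 * catch(...) {}
 * forward_selection(features, labels, result);      // continues at result.pivot
 * \endcode
 */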
/** Perform backward elimination
 *
 * \param features IN: n x p matrix containing n instances with p attributes/features
 *                 used in the variable selection algorithm
 * \param response IN: n x 1 matrix containing the corresponding response
 * \param result   IN/OUT: VariableSelectionResult struct which will contain the results
 *                 of the algorithm.
 *                 Features between result.pivot and result.selected.end() will
 *                 be left untouched.
 *                 \sa VariableSelectionResult
 * \param errorcallback
 *                 IN, OPTIONAL:
 *                 Functor that returns the error rate given a set of
 *                 features and labels. Default is the random forest OOB error.
 *
 * Backward elimination successively eliminates the feature that has the least
 * influence on the error rate.
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * VariableSelectionResult result;
 * backward_elimination(features, labels, result);
 * \endcode
 * To use backward elimination but ensure that a specific feature, e.g. feature 5,
 * is always excluded, one would do the following:
 *
 * \code
 * VariableSelectionResult result;
 * result.init(features, labels);
 * std::swap(result.selected[result.selected.size()-1], result.selected[5]);
 * result.pivot = result.selected.end() - 1;
 * backward_elimination(features, labels, result);
 * \endcode
 *
 * \sa VariableSelectionResult
 *
 */
template<class FeatureT, class ResponseT, class ErrorRateCallBack>
void backward_elimination(FeatureT const & features,
                          ResponseT const & response,
                          VariableSelectionResult & result,
                          ErrorRateCallBack errorcallback)
{
    int featureCount = features.shape(1);
    VariableSelectionResult::FeatureList_t & selected = result.selected;
    VariableSelectionResult::ErrorList_t &   errors   = result.errors;
    VariableSelectionResult::Pivot_t &       pivot    = result.pivot;

    // initialize the result struct if it is used for the first time
    if(!result.init(features, response, errorcallback))
    {
        // result is being reused -- just ensure that the number of features
        // is the same.
        vigra_precondition((int)selected.size() == featureCount,
            "backward_elimination(): Number of features in feature "
            "matrix and number of features in previously used "
            "result struct mismatch!");
    }
    pivot = selected.end() - 1;

    int selected_size = std::distance(selected.begin(), pivot);
    while(selected_size > 1)
    {
        VariableSelectionResult::Pivot_t next = selected.begin();
        std::vector<double> current_errors;
        for(int ii = 0; ii < selected_size; ++ii, ++next)
        {
            std::swap(*pivot, *next);
            MultiArray<2, double> cur_feats;
            detail::choose(features, selected.begin(), pivot+1, cur_feats);
            double error = errorcallback(cur_feats, response);
            current_errors.push_back(error);
            std::swap(*pivot, *next);
        }
        int pos = std::distance(current_errors.begin(),
                                std::min_element(current_errors.begin(),
                                                 current_errors.end()));
        next = selected.begin();
        std::advance(next, pos);
        std::swap(*pivot, *next);
        errors[std::distance(selected.begin(), pivot)-1] = current_errors[pos];
        selected_size = std::distance(selected.begin(), pivot);
#ifdef RN_VERBOSE
        std::copy(current_errors.begin(), current_errors.end(),
                  std::ostream_iterator<double>(std::cerr, ", "));
        std::cerr << "Eliminating " << *pivot << " at error of " << current_errors[pos] << std::endl;
#endif
        --pivot;
    }
}

template<class FeatureT, class ResponseT>
void backward_elimination(FeatureT const & features,
                          ResponseT const & response,
                          VariableSelectionResult & result)
{
    backward_elimination(features, response, result, RFErrorCallback());
}
/** Perform rank selection using a predefined ranking
 *
 * \param features IN: n x p matrix containing n instances with p attributes/features
 *                 used in the variable selection algorithm
 * \param response IN: n x 1 matrix containing the corresponding response
 * \param result   IN/OUT: VariableSelectionResult struct which will contain the results
 *                 of the algorithm. The struct should be initialized with the
 *                 predefined ranking.
 *                 \sa VariableSelectionResult
 * \param errorcallback
 *                 IN, OPTIONAL:
 *                 Functor that returns the error rate given a set of
 *                 features and labels. Default is the random forest OOB error.
 *
 * Often some variable importance or score measure is used to create the ordering
 * in which the variables have to be selected. This method takes such a ranking
 * and calculates the corresponding error rates.
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * std::vector<int> ranking = createRanking(features);
 * VariableSelectionResult result;
 * result.init(features, labels, ranking.begin(), ranking.end());
 * rank_selection(features, labels, result);
 * \endcode
 *
 * \sa VariableSelectionResult
 *
 */
template<class FeatureT, class ResponseT, class ErrorRateCallBack>
void rank_selection(FeatureT const & features,
                    ResponseT const & response,
                    VariableSelectionResult & result,
                    ErrorRateCallBack errorcallback)
{
    VariableSelectionResult::FeatureList_t & selected = result.selected;
    VariableSelectionResult::ErrorList_t &   errors   = result.errors;
    VariableSelectionResult::Pivot_t &       iter     = result.pivot;
    int featureCount = features.shape(1);
    // initialize the result struct if it is used for the first time
    if(!result.init(features, response, errorcallback))
    {
        // result is being reused -- just ensure that the number of features
        // is the same.
        vigra_precondition((int)selected.size() == featureCount,
            "rank_selection(): Number of features in feature "
            "matrix and number of features in previously used "
            "result struct mismatch!");
    }

    int ii = 0;
    for(; iter != selected.end(); ++iter)
    {
        ++ii;
        MultiArray<2, double> cur_feats;
        detail::choose(features, selected.begin(), iter+1, cur_feats);
        double error = errorcallback(cur_feats, response);
        errors[std::distance(selected.begin(), iter)] = error;
#ifdef RN_VERBOSE
        std::copy(selected.begin(), iter+1, std::ostream_iterator<int>(std::cerr, ", "));
        std::cerr << "Choosing " << *iter << " at error of " << error << std::endl;
#endif
    }
}

template<class FeatureT, class ResponseT>
void rank_selection(FeatureT const & features,
                    ResponseT const & response,
                    VariableSelectionResult & result)
{
    rank_selection(features, response, result, RFErrorCallback());
}
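/* A sketch of feeding rank_selection() with a ranking derived from
 * per-feature scores via get_ranking() (defined at the end of this file);
 * computeFeatureScores() is a hypothetical placeholder for any importance
 * or score measure:
 *
 * \code
 * std::vector<double> scores = computeFeatureScores(); // hypothetical
 * std::vector<int> ranking;
 * get_ranking(scores, ranking);  // feature indices, best score first
 * VariableSelectionResult result;
 * result.init(features, labels, ranking.begin(), ranking.end());
 * rank_selection(features, labels, result);
 * \endcode
 */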
enum ClusterLeafTypes { c_Leaf = 95, c_Node = 99 };

/* View of a node in the hierarchical clustering class.
 * For internal use only.
 * \sa NodeBase
 */
class ClusterNode
: public NodeBase
{
  public:

    typedef NodeBase BT;

    /** constructors **/
    ClusterNode()
    : NodeBase()
    {}

    ClusterNode(int nCol,
                BT::T_Container_type & topology,
                BT::P_Container_type & split_param)
    : BT(nCol + 5, 5, topology, split_param)
    {
        status() = 0;
        BT::column_data()[0] = nCol;
        if(nCol == 1)
            BT::typeID() = c_Leaf;
        else
            BT::typeID() = c_Node;
    }

    ClusterNode(BT::T_Container_type const & topology,
                BT::P_Container_type const & split_param,
                int n)
    : NodeBase(5, 5, topology, split_param, n)
    {
        // TODO: is there a more elegant way to do this?
        BT::topology_size_ += BT::column_data()[0];
    }

    ClusterNode(BT & node_)
    : BT(5, 5, node_)
    {
        // TODO: is there a more elegant way to do this?
        BT::topology_size_ += BT::column_data()[0];
        BT::parameter_size_ += 0;
    }

    int index()
    {
        return static_cast<int>(BT::parameters_begin()[1]);
    }
    void set_index(int in)
    {
        BT::parameters_begin()[1] = in;
    }
    double & mean()
    {
        return BT::parameters_begin()[2];
    }
    double & stdev()
    {
        return BT::parameters_begin()[3];
    }
    double & status()
    {
        return BT::parameters_begin()[4];
    }
};

/** Stack entry class for the HClustering class.
 */
struct HC_Entry
{
    int  parent;
    int  level;
    int  addr;
    bool infm;
    HC_Entry(int p, int l, int a, bool in)
    : parent(p), level(l), addr(a), infm(in)
    {}
};


/** Hierarchical clustering class.
 *  Performs single linkage clustering.
 * \code
 * Matrix<double> distance = get_distance_matrix();
 * HClustering linkage;
 * linkage.cluster(distance);
 * // draw the clustering tree.
 * Draw<double, int> draw(features, labels, "linkagetree.graph");
 * linkage.breadth_first_traversal(draw);
 * \endcode
 * \sa ClusterImportanceVisitor
 *
 * Once the clustering has taken place, information queries can be made
 * using the breadth_first_traversal() and iterate() methods.
 */
class HClustering
{
  public:
    typedef MultiArrayShape<2>::type Shp;
    ArrayVector<int>    topology_;
    ArrayVector<double> parameters_;
    int begin_addr;

    /** Calculates the distance between two merged clusters
     *  (single linkage: the smaller of the two distances).
     */
    double dist_func(double a, double b)
    {
        return std::min(a, b);
    }

    /** Visit each node with a functor,
     *  in creation order (should be depth first).
     */
    template<class Functor>
    void iterate(Functor & tester)
    {
        std::vector<int> stack;
        stack.push_back(begin_addr);
        while(!stack.empty())
        {
            ClusterNode node(topology_, parameters_, stack.back());
            stack.pop_back();
            if(!tester(node))
            {
                if(node.columns_size() != 1)
                {
                    stack.push_back(node.child(0));
                    stack.push_back(node.child(1));
                }
            }
        }
    }

    /** Perform breadth first traversal of the hierarchical cluster tree.
     */
    template<class Functor>
    void breadth_first_traversal(Functor & tester)
    {
        std::queue<HC_Entry> queue;
        int  level  = 0;
        int  parent = -1;
        int  addr   = -1;
        bool infm   = false;
        queue.push(HC_Entry(parent, level, begin_addr, infm));
        while(!queue.empty())
        {
            level  = queue.front().level;
            parent = queue.front().parent;
            addr   = queue.front().addr;
            infm   = queue.front().infm;
            ClusterNode node(topology_, parameters_, queue.front().addr);
            ClusterNode parnt;
            if(parent != -1)
            {
                parnt = ClusterNode(topology_, parameters_, parent);
            }
            queue.pop();
            bool istrue = tester(node, level, parnt, infm);
            if(node.columns_size() != 1)
            {
                queue.push(HC_Entry(addr, level + 1, node.child(0), istrue));
                queue.push(HC_Entry(addr, level + 1, node.child(1), istrue));
            }
        }
    }
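    /* iterate() expects a functor with bool operator()(Node &) -- returning
     * true prunes the traversal below that node. breadth_first_traversal()
     * expects bool operator()(Node & cur, int level, Node parent, bool infm),
     * where infm receives the value the functor returned for the parent
     * node. A minimal counting sketch for iterate():
     *
     * \code
     * struct CountNodes
     * {
     *     int count;
     *     CountNodes() : count(0) {}
     *     template<class Node>
     *     bool operator()(Node & node)
     *     {
     *         ++count;
     *         return false; // false: descend into the children
     *     }
     * };
     * CountNodes counter;
     * linkage.iterate(counter);
     * \endcode
     */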
    /** save to HDF5 -- defunct, has to be updated to the new HDF5 interface
     */
#ifdef HasHDF5
    void save(std::string file, std::string prefix)
    {
        vigra::writeHDF5(file.c_str(), (prefix + "topology").c_str(),
                         MultiArrayView<2, int>(Shp(topology_.size(), 1),
                                                topology_.data()));
        vigra::writeHDF5(file.c_str(), (prefix + "parameters").c_str(),
                         MultiArrayView<2, double>(Shp(parameters_.size(), 1),
                                                   parameters_.data()));
        vigra::writeHDF5(file.c_str(), (prefix + "begin_addr").c_str(),
                         MultiArrayView<2, int>(Shp(1, 1), &begin_addr));
    }
#endif

    /** Perform single linkage clustering.
     * \param distance distance matrix used. \sa CorrelationVisitor
     */
    template<class T, class C>
    void cluster(MultiArrayView<2, T, C> distance)
    {
        MultiArray<2, T> dist(distance);
        std::vector<std::pair<int, int> > addr;
        typedef std::pair<int, int> Entry;
        int index = 0;
        for(int ii = 0; ii < distance.shape(0); ++ii)
        {
            addr.push_back(std::make_pair(topology_.size(), ii));
            ClusterNode leaf(1, topology_, parameters_);
            leaf.set_index(index);
            ++index;
            leaf.columns_begin()[0] = ii;
        }

        while(addr.size() != 1)
        {
            // find the two nodes with the smallest distance
            int ii_min = 0;
            int jj_min = 1;
            double min_dist = dist((addr.begin()+ii_min)->second,
                                   (addr.begin()+jj_min)->second);
            for(unsigned int ii = 0; ii < addr.size(); ++ii)
            {
                for(unsigned int jj = ii+1; jj < addr.size(); ++jj)
                {
                    if(dist((addr.begin()+ii_min)->second,
                            (addr.begin()+jj_min)->second)
                       > dist((addr.begin()+ii)->second,
                              (addr.begin()+jj)->second))
                    {
                        min_dist = dist((addr.begin()+ii)->second,
                                        (addr.begin()+jj)->second);
                        ii_min = ii;
                        jj_min = jj;
                    }
                }
            }

            // merge the two nodes
            int col_size = 0;
            // creating a new node invalidates the iterators stored in
            // firstChild and secondChild, hence the extra scope.
            {
                ClusterNode firstChild(topology_, parameters_,
                                       (addr.begin()+ii_min)->first);
                ClusterNode secondChild(topology_, parameters_,
                                        (addr.begin()+jj_min)->first);
                col_size = firstChild.columns_size() + secondChild.columns_size();
            }
            int cur_addr = topology_.size();
            begin_addr = cur_addr;
            ClusterNode parent(col_size, topology_, parameters_);
            ClusterNode firstChild(topology_, parameters_,
                                   (addr.begin()+ii_min)->first);
            ClusterNode secondChild(topology_, parameters_,
                                    (addr.begin()+jj_min)->first);
            parent.parameters_begin()[0] = min_dist;
            parent.set_index(index);
            ++index;
            std::merge(firstChild.columns_begin(),  firstChild.columns_end(),
                       secondChild.columns_begin(), secondChild.columns_end(),
                       parent.columns_begin());
            // merge the nodes in addr
            int to_keep;
            int to_desc;
            int ii_keep;
            if(*parent.columns_begin() == *firstChild.columns_begin())
            {
                parent.child(0) = (addr.begin()+ii_min)->first;
                parent.child(1) = (addr.begin()+jj_min)->first;
                (addr.begin()+ii_min)->first = cur_addr;
                ii_keep = ii_min;
                to_keep = (addr.begin()+ii_min)->second;
                to_desc = (addr.begin()+jj_min)->second;
                addr.erase(addr.begin()+jj_min);
            }
            else
            {
                parent.child(1) = (addr.begin()+ii_min)->first;
                parent.child(0) = (addr.begin()+jj_min)->first;
                (addr.begin()+jj_min)->first = cur_addr;
                ii_keep = jj_min;
                to_keep = (addr.begin()+jj_min)->second;
                to_desc = (addr.begin()+ii_min)->second;
                addr.erase(addr.begin()+ii_min);
            }
            // update the distances
            for(int jj = 0; jj < (int)addr.size(); ++jj)
            {
                if(jj == ii_keep)
                    continue;
                double d = dist_func(
                        dist(to_desc, (addr.begin()+jj)->second),
                        dist((addr.begin()+ii_keep)->second,
                             (addr.begin()+jj)->second));
                dist((addr.begin()+ii_keep)->second,
                     (addr.begin()+jj)->second) = d;
                dist((addr.begin()+jj)->second,
                     (addr.begin()+ii_keep)->second) = d;
            }
        }
    }
};
/** Normalize the status value in the HClustering tree (HClustering visitor).
 */
class NormalizeStatus
{
  public:
    double n;
    /** Constructor
     * \param m normalize status() by m
     */
    NormalizeStatus(double m)
    : n(m)
    {}

    template<class Node>
    bool operator()(Node & node)
    {
        node.status() /= n;
        return false;
    }
};


/** Perform permutation importance on HClustering clusters
 *  (HClustering visitor).
 *
 *  See the visit_after_tree() method of visitors::VariableImportance for
 *  the basic idea; here the permutation is applied not only to single
 *  variables but to whole clusters.
 */
template<class Iter, class DT>
class PermuteCluster
{
  public:
    typedef MultiArrayShape<2>::type Shp;
    Matrix<double> tmp_mem_;
    MultiArrayView<2, double> perm_imp;
    MultiArrayView<2, double> orig_imp;
    Matrix<double> feats_;
    Matrix<int>    labels_;
    const int  nPerm;
    DT const & dt;
    int index;
    int oob_size;

    template<class Feat_T, class Label_T>
    PermuteCluster(Iter a, Iter b,
                   Feat_T  const & feats,
                   Label_T const & labls,
                   MultiArrayView<2, double> p_imp,
                   MultiArrayView<2, double> o_imp,
                   int np,
                   DT const & dt_)
    : tmp_mem_(_spl(a, b).size(), feats.shape(1)),
      perm_imp(p_imp),
      orig_imp(o_imp),
      feats_(_spl(a, b).size(), feats.shape(1)),
      labels_(_spl(a, b).size(), 1),
      nPerm(np),
      dt(dt_),
      index(0),
      oob_size(b - a)
    {
        copy_splice(_spl(a, b), _spl(feats.shape(1)), feats, feats_);
        copy_splice(_spl(a, b), _spl(labls.shape(1)), labls, labels_);
    }

    template<class Node>
    bool operator()(Node & node)
    {
        tmp_mem_ = feats_;
        RandomMT19937 random;
        int class_count = perm_imp.shape(1) - 1;
        // permute the columns of a cluster together
        for(int kk = 0; kk < nPerm; ++kk)
        {
            tmp_mem_ = feats_;
            for(int ii = 0; ii < rowCount(feats_); ++ii)
            {
                int rnd = random.uniformInt(rowCount(feats_) - ii) + ii;
                for(int jj = 0; jj < node.columns_size(); ++jj)
                {
                    if(node.columns_begin()[jj] != feats_.shape(1))
                        tmp_mem_(ii, node.columns_begin()[jj])
                            = tmp_mem_(rnd, node.columns_begin()[jj]);
                }
            }

            for(int ii = 0; ii < rowCount(tmp_mem_); ++ii)
            {
                if(dt.predictLabel(rowVector(tmp_mem_, ii))
                   == labels_(ii, 0))
                {
                    // per class
                    ++perm_imp(index, labels_(ii, 0));
                    // total
                    ++perm_imp(index, class_count);
                }
            }
        }
        double node_status = perm_imp(index, class_count);
        node_status /= nPerm;
        node_status -= orig_imp(0, class_count);
        node_status *= -1;
        node_status /= oob_size;
        node.status() += node_status;
        ++index;

        return false;
    }
};
/** Convert the clustering tree into a list (HClustering visitor).
 */
class GetClusterVariables
{
  public:
    /** NumberOfClusters x NumberOfVariables MultiArrayView containing
     *  in each row the variables belonging to a cluster
     */
    MultiArrayView<2, int> variables;
    int index;

    GetClusterVariables(MultiArrayView<2, int> vars)
    : variables(vars), index(0)
    {}

#ifdef HasHDF5
    void save(std::string file, std::string prefix)
    {
        vigra::writeHDF5(file.c_str(), (prefix + "_variables").c_str(),
                         variables);
    }
#endif

    template<class Node>
    bool operator()(Node & node)
    {
        for(int ii = 0; ii < node.columns_size(); ++ii)
            variables(index, ii) = node.columns_begin()[ii];
        ++index;
        return false;
    }
};

/** Corrects the status fields of a linkage clustering (HClustering visitor)
 *
 *  such that status(currentNode) = min(status(parent), status(currentNode)).
 * \sa cluster_permutation_importance()
 */
class CorrectStatus
{
  public:
    template<class Nde>
    bool operator()(Nde & cur, int level, Nde parent, bool infm)
    {
        if(parent.hasData_)
            cur.status() = std::min(parent.status(), cur.status());
        return true;
    }
};


/** Draw the current linkage clustering (HClustering visitor).
 *
 *  Creates a graphviz .dot file.
 *  usage:
 * \code
 * Matrix<double> distance = get_distance_matrix();
 * linkage.cluster(distance);
 * Draw<double, int> draw(features, labels, "linkagetree.graph");
 * linkage.breadth_first_traversal(draw);
 * \endcode
 */
template<class T1,
         class T2,
         class C1 = UnstridedArrayTag,
         class C2 = UnstridedArrayTag>
class Draw
{
  public:
    typedef MultiArrayShape<2>::type Shp;
    MultiArrayView<2, T1, C1> const & features_;
    MultiArrayView<2, T2, C2> const & labels_;
    std::ofstream graphviz;

    Draw(MultiArrayView<2, T1, C1> const & features,
         MultiArrayView<2, T2, C2> const & labels,
         std::string const gz)
    : features_(features), labels_(labels),
      graphviz(gz.c_str(), std::ios::out)
    {
        graphviz << "digraph G\n{\n node [shape=\"record\"]";
    }

    ~Draw()
    {
        graphviz << "\n}\n";
        graphviz.close();
    }

    template<class Nde>
    bool operator()(Nde & cur, int level, Nde parent, bool infm)
    {
        graphviz << "node" << cur.index() << " [style=\"filled\"][label = \" #Feats: "
                 << cur.columns_size() << "\\n";
        graphviz << " status: " << cur.status() << "\\n";
        for(int kk = 0; kk < cur.columns_size(); ++kk)
        {
            graphviz << cur.columns_begin()[kk] << " ";
            if(kk % 15 == 14)
                graphviz << "\\n";
        }
        graphviz << "\"] [color = \"" << cur.status() << " 1.000 1.000\"];\n";
        if(parent.hasData_)
            graphviz << "\"node" << parent.index() << "\" -> \"node" << cur.index() << "\";\n";
        return true;
    }
};
/** Calculate cluster-based permutation importance while learning
 *  (random forest visitor).
 */
class ClusterImportanceVisitor : public visitors::VisitorBase
{
  public:

    /** List of variables as produced by GetClusterVariables
     */
    MultiArray<2, int> variables;
    /** Corresponding importance measures
     */
    MultiArray<2, double> cluster_importance_;
    /** Corresponding error
     */
    MultiArray<2, double> cluster_stdev_;
    int  repetition_count_;
    bool in_place_;
    HClustering & clustering;

#ifdef HasHDF5
    void save(std::string filename, std::string prefix)
    {
        std::string prefix1 = "cluster_importance_" + prefix;
        writeHDF5(filename.c_str(), prefix1.c_str(), cluster_importance_);
        prefix1 = "vars_" + prefix;
        writeHDF5(filename.c_str(), prefix1.c_str(), variables);
    }
#endif

    ClusterImportanceVisitor(HClustering & clst, int rep_cnt = 10)
    : repetition_count_(rep_cnt), clustering(clst)
    {}

    /** Allocate enough memory.
     */
    template<class RF, class PR>
    void visit_at_beginning(RF const & rf, PR const & pr)
    {
        Int32 const class_count  = rf.ext_param_.class_count_;
        Int32 const column_count = rf.ext_param_.column_count_ + 1;
        cluster_importance_
            .reshape(MultiArrayShape<2>::type(2*column_count - 1,
                                              class_count + 1));
        cluster_stdev_
            .reshape(MultiArrayShape<2>::type(2*column_count - 1,
                                              class_count + 1));
        variables
            .reshape(MultiArrayShape<2>::type(2*column_count - 1,
                                              column_count), -1);
        GetClusterVariables gcv(variables);
        clustering.iterate(gcv);
    }

    /** Compute permutation based variable importance.
     *  (Only an array of size oob_sample_count x 1 is created --
     *   as opposed to oob_sample_count x feature_count in the other
     *   method.)
     *
     * \sa FieldProxy
     */
    template<class RF, class PR, class SM, class ST>
    void after_tree_ip_impl(RF & rf, PR & pr, SM & sm, ST & st, int index)
    {
        typedef MultiArrayShape<2>::type Shp_t;
        Int32 column_count = rf.ext_param_.column_count_ + 1;
        Int32 class_count  = rf.ext_param_.class_count_;

        // remove the constness of the features (the data is not
        // destroyed).
        typename PR::Feature_t & features
            = const_cast<typename PR::Feature_t &>(pr.features());

        // find the oob indices of the current tree.
        ArrayVector<Int32> oob_indices;
        ArrayVector<Int32>::iterator iter;

        if(rf.ext_param_.actual_msample_ < pr.features().shape(0) - 10000)
        {
            ArrayVector<int>   cts(2, 0);
            ArrayVector<Int32> indices(pr.features().shape(0));
            for(int ii = 0; ii < pr.features().shape(0); ++ii)
                indices[ii] = ii;
            std::random_shuffle(indices.begin(), indices.end());
            for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
            {
                if(!sm.is_used()[indices[ii]] && cts[pr.response()(indices[ii], 0)] < 3000)
                {
                    oob_indices.push_back(indices[ii]);
                    ++cts[pr.response()(indices[ii], 0)];
                }
            }
        }
        else
        {
            for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
                if(!sm.is_used()[ii])
                    oob_indices.push_back(ii);
        }

        // random number generation
        RandomMT19937 random(RandomSeed);
        UniformIntRandomFunctor<RandomMT19937> randint(random);

        // make some space for the results
        MultiArray<2, double> oob_right(Shp_t(1, class_count + 1));

        // get the oob success rate with the original samples
        for(iter = oob_indices.begin(); iter != oob_indices.end(); ++iter)
        {
            if(rf.tree(index).predictLabel(rowVector(features, *iter))
               == pr.response()(*iter, 0))
            {
                // per class
                ++oob_right[pr.response()(*iter, 0)];
                // total
                ++oob_right[class_count];
            }
        }

        MultiArray<2, double>
            perm_oob_right(Shp_t(2*column_count - 1, class_count + 1));

        PermuteCluster<ArrayVector<Int32>::iterator, typename RF::DecisionTree_t>
            pc(oob_indices.begin(), oob_indices.end(),
               pr.features(), pr.response(),
               perm_oob_right, oob_right,
               repetition_count_, rf.tree(index));
        clustering.iterate(pc);

        perm_oob_right /= repetition_count_;
        for(int ii = 0; ii < rowCount(perm_oob_right); ++ii)
            rowVector(perm_oob_right, ii) -= oob_right;

        perm_oob_right *= -1;
        perm_oob_right /= oob_indices.size();
        cluster_importance_ += perm_oob_right;
    }

    /** Calculate the permutation based importance after every tree has
     *  been learned. The default behaviour is that this happens out of
     *  place. If you have very big data sets and want to avoid copying
     *  the data, set the in_place_ flag to true.
     */
    template<class RF, class PR, class SM, class ST>
    void visit_after_tree(RF & rf, PR & pr, SM & sm, ST & st, int index)
    {
        after_tree_ip_impl(rf, pr, sm, st, index);
    }

    /** Normalise the variable importance after the number of trees
     *  is known.
     */
    template<class RF, class PR>
    void visit_at_end(RF & rf, PR & pr)
    {
        NormalizeStatus nrm(rf.tree_count());
        clustering.iterate(nrm);
        cluster_importance_ /= rf.trees_.size();
    }
};
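/* After cluster_permutation_importance() (below) has run, the importance of
 * each cluster sits in the status() field of its node. A sketch of
 * collecting these values with HClustering::iterate():
 *
 * \code
 * struct PrintStatus
 * {
 *     template<class Node>
 *     bool operator()(Node & node)
 *     {
 *         std::cout << "cluster " << node.index() << ": importance "
 *                   << node.status() << std::endl;
 *         return false;
 *     }
 * };
 * PrintStatus printer;
 * linkage.iterate(printer);
 * \endcode
 */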
/** Perform hierarchical clustering of variables and assess the importance of the clusters
 *
 * \param features IN: n x p matrix containing n instances with p attributes/features
 *                 used in the variable selection algorithm
 * \param response IN: n x 1 matrix containing the corresponding response
 * \param linkage  OUT: hierarchical grouping of the variables.
 * \param distance OUT: distance matrix used for creating the linkage
 *
 * Performs hierarchical clustering of the variables and calculates the
 * permutation importance measure of each of the clusters. Use the Draw
 * functor to create human readable output.
 * The cluster permutation importance measure corresponds to the normal
 * permutation importance measure with all columns belonging to a cluster
 * permuted together.
 * The importance measure for each cluster is stored as the status() field
 * of each cluster node. \sa HClustering
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * HClustering linkage;
 * MultiArray<2, double> distance;
 * cluster_permutation_importance(features, labels, linkage, distance);
 * // create graphviz output
 * Draw<double, int> draw(features, labels, "linkagetree.graph");
 * linkage.breadth_first_traversal(draw);
 * \endcode
 *
 */
template<class FeatureT, class ResponseT>
void cluster_permutation_importance(FeatureT const & features,
                                    ResponseT const & response,
                                    HClustering & linkage,
                                    MultiArray<2, double> & distance)
{
    RandomForestOptions opt;
    opt.tree_count(100);
    if(features.shape(0) > 40000)
        opt.samples_per_tree(20000).use_stratification(RF_EQUAL);

    vigra::RandomForest<int> RF(opt);
    visitors::RandomForestProgressVisitor progress;
    visitors::CorrelationVisitor missc;
    RF.learn(features, response, create_visitor(missc, progress));
    distance = missc.distance;

    // produce the linkage
    linkage.cluster(distance);

    vigra::RandomForest<int> RF2(opt);
    ClusterImportanceVisitor ci(linkage);
    RF2.learn(features, response, create_visitor(progress, ci));

    CorrectStatus cs;
    linkage.breadth_first_traversal(cs);
}

template<class FeatureT, class ResponseT>
void cluster_permutation_importance(FeatureT const & features,
                                    ResponseT const & response,
                                    HClustering & linkage)
{
    MultiArray<2, double> distance;
    cluster_permutation_importance(features, response, linkage, distance);
}


/** Convert an array of per-feature scores into a ranking
 *  (feature indices ordered by descending score).
 *
 *  Note: the scores are used as map keys, so features with identical
 *  scores collapse into a single entry.
 */
template<class Array1, class Vector1>
void get_ranking(Array1 const & in, Vector1 & out)
{
    std::map<double, int> mymap;
    for(int ii = 0; ii < in.size(); ++ii)
        mymap[in[ii]] = ii;
    for(std::map<double, int>::reverse_iterator iter = mymap.rbegin();
        iter != mymap.rend(); ++iter)
    {
        out.push_back(iter->second);
    }
}

} // namespace algorithms
} // namespace rf
} // namespace vigra

#endif // VIGRA_RF_ALGORITHM_HXX