rdbstats  2.0.7
RdbStatsPercentiles.cc
1 // File: RdbStatsPercentiles.cc
2 
3 // --8<--8<--8<--8<--
4 //
5 // Copyright (C) 2006 Smithsonian Astrophysical Observatory
6 //
7 // This file is part of rdbstats
8 //
9 // rdbstats is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU General Public License
11 // as published by the Free Software Foundation; either version 2
12 // of the License, or (at your option) any later version.
13 //
14 // rdbstats is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 // GNU General Public License for more details.
18 //
19 // You should have received a copy of the GNU General Public License
20 // along with this program; if not, write to the
21 // Free Software Foundation, Inc.
22 // 51 Franklin Street, Fifth Floor
23 // Boston, MA 02110-1301, USA
24 //
25 // -->8-->8-->8-->8--
26 
27 #include <sstream>
28 #include <algorithm>
29 
30 #include <suplibxx/str.h>
31 
32 #include "RdbStatsPercentiles.h"
33 
34 // The user specified percentiles. For example: --percentiles 12,34,45
35 RdbStatsPercentiles::RdbStatsPercentiles( RDB& irdbtable, const std::string& name,
36  const std::string& percent_list,
37  bool implicitquartile )
38 : RdbStats( irdbtable, name ), the_median( 0.0 ) {
39 
40  std::vector< std::string > tokens;
41 
42  suplib::tok( tokens, percent_list, "," );
43 
44  const int mysize( tokens.size( ) );
45  for ( int ii = 0; ii < mysize; ii++ ) {
46  std::string sval = std::string( "_p" ) + tokens[ ii ];
47  double dval = suplib::str2d( tokens[ ii ].c_str( ) );
48  the_percentile.push_back( std::pair< std::string, double >( sval, dval ) );
49  }
50 
51  for ( int ii = 0; ii < mysize; ii++ ) {
52  if ( the_percentile[ ii ].second <= 0.0 ||
53  the_percentile[ ii ].second > 100.0 ) {
54  std::ostringstream ost;
55  ost << "The percentiles option (" << percent_list << ") must be "
56  "within [ 0.0, 100.0 ]\n";
57  throw Exception( ost.str( ) );
58  }
59  }
60 
61  if ( implicitquartile ) {
62  the_percentile.push_back( std::pair< std::string, double >( "_fq", 25.0 ) );
63  the_percentile.push_back( std::pair< std::string, double >( "_lq", 75.0 ) );
64  }
65 }
66 
67 // --quartiles was entered at the command line.
68 RdbStatsPercentiles::RdbStatsPercentiles( RDB& irdbtable, const std::string& name )
69 : RdbStats( irdbtable, name ), the_median( 0.0 ) {
70 
71  the_percentile.push_back( std::pair< std::string, double >( "_fq", 25.0 ) );
72  the_percentile.push_back( std::pair< std::string, double >( "_lq", 75.0 ) );
73 }
74 
75 double RdbStatsPercentiles::calculate_median( ) {
76 
77  std::vector< double >::iterator begin_ptr = data.begin( );
78 
79  size_t num_size = data.size( );
80  size_t num_size_div = num_size / 2;
81 
82  nth_element( begin_ptr, begin_ptr + num_size_div, data.end( ) );
83 
84  if ( num_size & 0001 ) {
85 
86  // num_size is odd:
87 
88  return data[ num_size_div ];
89 
90  } else {
91 
92  // num_size is even:
93 
94  double tmp = data[ num_size_div ];
95 
96  //
97  // element beyond num_size_div is guarranteed
98  // to be > then element at data[ num_size_div ].
99  //
100  nth_element( begin_ptr, begin_ptr + num_size_div - 1,
101  begin_ptr + num_size_div );
102  tmp += data[ num_size_div - 1 ];
103 
104  return 0.5 * tmp;
105 
106  }
107 }
108 
109 double RdbStatsPercentiles::calculate_percentile( const double percentile ) {
110 
111  std::vector< double >::iterator begin_ptr = data.begin( );
112 
113  size_t n = data.size( );
114  double f = percentile / 100.0;
115  int i = ( ( n - 1.0 ) * f );
116  double delta = ( n - 1.0 ) * f - i;
117 
118  nth_element( begin_ptr, begin_ptr + i + 1, data.end( ) );
119  double data_i_1 = data[ i + 1];
120 
121  nth_element( begin_ptr, begin_ptr + i, data.end( ) );
122  double data_i = data[ i ];
123 
124  double quantile = ( 1.0 - delta ) * data_i + delta * data_i_1;
125 
126  /*
127  cerr << "n: " << n << endl
128  << "f: " << f << endl
129  << "i: " << i << endl
130  << "delta: " << delta << endl
131  << "data_i: " << data_i << endl
132  << "data_i_1: " << data_i_1 << endl
133  << "quantile: " << quantile << endl << endl;
134  */
135  return quantile;
136 }
137 
139 
140  if ( 0 == this->RdbStats::calculate_statistics( ) )
141  return 0;
142 
143  the_median = calculate_median( );
144 
145  for ( size_t ii = 0; ii < the_percentile.size( ); ii++ )
146  the_percentile_result[ ii ] =
147  calculate_percentile( the_percentile[ ii ].second );
148 
149  return num_n;
150 }
151 
152 void RdbStatsPercentiles::init( ) {
153  this->RdbStats::init( );
154 
155  // Since we are, possibly, starting a new group so wipe out data.
156  data.clear( );
157 }
158 
159 void RdbStatsPercentiles::normalize_results( const double norm ) {
160 
161  this->RdbStats::normalize_results( norm );
162  for ( size_t ii = 0; ii < the_percentile.size( ); ii++ )
163  the_percentile_result[ ii ] /= norm;
164 
165 }
166 
167 void RdbStatsPercentiles::set_output_columns( RDB& ordbtable ) {
168 
169  this->RdbStats::set_output_columns( ordbtable );
170 
171  const int mysize( the_percentile.size( ) );
172  the_percentile_result.reserve( mysize );
173 
174  const char* column_name_ptr = column_name.c_str( );
175 
176  char str[ 256 ];
177  sprintf( str, "%s_median", column_name_ptr );
178  ordbtable.setColumn( str, "N" );
179  RDBColumn* tmp = ordbtable.getColumn( str );
180  tmp->mapData( &the_median, 1 );
181 
182  for ( int ii = 0; ii < mysize; ii++ ) {
183 
184  std::ostringstream ost;
185  ost << column_name_ptr << the_percentile[ ii ].first;
186  ordbtable.setColumn( ost.str( ), "N" );
187  RDBColumn* ptr = ordbtable.getColumn( ost.str( ) );
188  ptr->mapData( &the_percentile_result[ii], 1 );
189 
190  }
191 }
192 
195 
196  data.push_back( get_value( ) );
197 }
The base class to calculate: average, maximum, minimum, num, stddev and sum.
Definition: RdbStats.h:39
virtual void update_statistics()
Read the column from RDB++, update the statistics for the column.
virtual void update_statistics()
Read the column from RDB++, update the statistics for the column.
Definition: RdbStats.cc:129
RdbStatsPercentiles(RDB &irdbtable, const std::string &name, const std::string &percent_list, bool implicitquartile)
--percentiles 12,34.. and --quartile
virtual int calculate_statistics()
Perform the final statistic for the set.
virtual int calculate_statistics()
Perform the final statistic for the set.
Definition: RdbStats.cc:49