OpenWalnut  1.5.0dev
WValueSetHistogram.h
1 //---------------------------------------------------------------------------
2 //
3 // Project: OpenWalnut ( http://www.openwalnut.org )
4 //
5 // Copyright 2009 OpenWalnut Community, BSV@Uni-Leipzig and CNCF@MPI-CBS
6 // For more information see http://www.openwalnut.org/copying
7 //
8 // This file is part of OpenWalnut.
9 //
10 // OpenWalnut is free software: you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // OpenWalnut is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public License
21 // along with OpenWalnut. If not, see <http://www.gnu.org/licenses/>.
22 //
23 //---------------------------------------------------------------------------
24 
25 #ifndef WVALUESETHISTOGRAM_H
26 #define WVALUESETHISTOGRAM_H
27 
28 #include <list>
29 #include <map>
30 #include <memory>
31 #include <utility>
32 
33 #include <boost/scoped_array.hpp>
34 #include <boost/shared_array.hpp>
35 
36 #include "../../common/WHistogram.h"
37 #include "../WValueSet.h"
38 
39 
40 /**
41  * Used to find the occurrence frequencies of values in a value set. It implements a classical histogram but allows easy modification of bucket
42  * sizes without unnecessary recalculation of the whole histogram. This histogram uses right-open intervals for counting, which is why there
43  * always is a bucket at the end from max to infinity which holds all the max values.
44  *
45  * \note This histogram is different from from WValueSetHistogram which is a generic histogram class.
46  */
47 class WValueSetHistogram: public WHistogram // NOLINT
48 {
49 /**
50 * Only UnitTests should be friends.
51 */
53 public:
54  /**
55  * Constructor. Creates the histogram for the specified value set.
56  *
57  * \param valueSet source of the data for the histogram
58  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
59  */
60  explicit WValueSetHistogram( std::shared_ptr< WValueSetBase > valueSet, size_t buckets = 1000 );
61 
62  /**
63  * Constructor. Creates the histogram for the specified value set.
64  *
65  * \param valueSet source of the data for the histogram
66  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
67  */
68  explicit WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets = 1000 );
69 
70  /**
71  * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
72  * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
73  * is especially useful to filter out outliers in data.
74  *
75  * \param valueSet source data
76  * \param min the new minimum to use
77  * \param max the maximum to use
78  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
79  */
80  WValueSetHistogram( std::shared_ptr< WValueSetBase > valueSet, double min, double max, size_t buckets = 1000 );
81 
82  /**
83  * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
84  * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
85  * is especially useful to filter out outliers in data.
86  *
87  * \param valueSet source data
88  * \param min the new minimum to use
89  * \param max the maximum to use
90  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
91  */
92  WValueSetHistogram( const WValueSetBase& valueSet, double min, double max, size_t buckets = 1000 );
93 
94  /**
95  * Copy constructor. If another interval size is given the histogram gets matched to it using the initial bucket data.
96  * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
97  *
98  * \param histogram another WValueSetHistogram
99  * \param buckets the new number of buckets. Must be larger than 1 if specified.
100  */
101  WValueSetHistogram( const WValueSetHistogram& histogram, size_t buckets = 0 );
102 
103  /**
104  * Destructor.
105  */
107 
108  /**
109  * Copy assignment. Copies the contents of the specified histogram to this instance.
110  *
111  * \param other the other instance
112  *
113  * \return this instance with the contents of the other one.
114  * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
115  */
117 
118  /**
119  * Get the count of the bucket.
120  *
121  * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
122  *
123  * \return elements in the bucket.
124  */
125  virtual size_t operator[]( size_t index ) const;
126 
127  /**
128  * Get the count of the bucket. Testing if the position is valid.
129  *
130  * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
131  *
132  * \return elements in the bucket
133  */
134  virtual size_t at( size_t index ) const;
135 
136  /**
137  * Returns the number of buckets in the histogram with the actual mapping.
138  *
139  * \return number of buckets
140  */
141  virtual size_t size() const;
142 
143  /**
144  * Return the size of one bucket.
145  *
146  * \param index the width for this bucket is queried.
147  *
148  * \return the size of a bucket.
149  */
150  virtual double getBucketSize( size_t index = 0 ) const;
151 
152  /**
153  * Returns the actual interval associated with the given index. The interval is open, meaning that
154  * getIntervalForIndex( i ).second == getIntervalForIndex( i + 1 ).first but does not belong anymore to the interval itself but every value
155  * smaller than getIntervalForIndex( i ).second.
156  *
157  * \param index the intex
158  *
159  * \return the open interval.
160  */
161  virtual std::pair< double, double > getIntervalForIndex( size_t index ) const;
162 
163  /**
164  * Returns the right index to the bucket containing the given value. If a value larger than the maximum, the maximum index is returned. Same
165  * for minimum; if the value is smaller than the minimum, 0 is returned.
166  *
167  * \param value the value to search the index for
168  *
169  * \return the index of the bucket
170  */
171  virtual size_t getIndexForValue( double value ) const;
172 
173  /**
174  * This returns the number of value set entries added to the histogram. This is especially useful to normalize the histogram counts.
175  *
176  * \return the number of elements distributed in the buckets.
177  */
178  virtual size_t getTotalElementCount() const;
179 
180  /**
181  * Sums up the buckets in the specified interval. Especially useful for cumulative distribution functions or similar.
182  *
183  * \param startIndex the index where to start counting including this one
184  * \param endIndex the index where to end summing up excluding this one.
185  *
186  * \return the sum of all buckets in the interval.
187  * \throw WOutOfBounds if one of the indices is invalid.
188  */
189  virtual size_t accumulate( size_t startIndex, size_t endIndex ) const;
190 
191 protected:
192  /**
193  * Return the initial buckets.
194  *
195  * \return m_initialBuckets
196  */
197  boost::shared_array< size_t > getInitialBuckets() const;
198 
199  /**
200  * Return the number of initial buckets.
201  *
202  * \return m_nInitialBuckets
203  */
204  size_t getNInitialBuckets() const;
205 
206  /**
207  * Return the size of one initial bucket.
208  *
209  * \return m_bucketSize
210  */
211  double getInitialBucketSize() const;
212 
213  /**
214  * increment the value by one, contains the logic to find the element place in the array.
215  * Should only be used in the constructor i.e. while iterating over WValueSet.
216  *
217  * \param value value to increment
218  */
219  virtual void insert( double value );
220 
221 private:
222  /**
223  * Size of one bucket in the initial histogram.
224  */
226 
227  /**
228  * Pointer to all initial buckets of the histogram.
229  */
230  boost::shared_array< size_t > m_initialBuckets;
231 
232  /**
233  * Number of buckets in the initial histogram.
234  */
236 
237  /**
238  * Pointer to all initial buckets of the histogram.
239  */
240  boost::shared_array< size_t > m_mappedBuckets;
241 
242  /**
243  * Tracks the number of a buckets in the mapped histogram.
244  */
246 
247  /**
248  * Size of one bucket in the mapped histogram.
249  */
251 
252  /**
253  * The number of elements distributed in the buckets.
254  */
256 
257  /**
258  * Actually builds the histogram. This function is simply used for avoiding code duplication in all these constructors.
259  *
260  * \param valueSet the value set.
261  */
262  void buildHistogram( const WValueSetBase& valueSet );
263 };
264 
265 /**
266  * Write a histogram in string representation to the given output stream.
267  */
268 std::ostream& operator<<( std::ostream& out, const WValueSetHistogram& h );
269 
270 inline size_t WValueSetHistogram::getIndexForValue( double value ) const
271 {
272  // the position on the scala
273  double pos = ( value - m_minimum ) / static_cast< double >( m_mappedBucketSize );
274  // the index is the floor( position )
275  size_t idx = static_cast< size_t >( std::floor( pos ) );
276 
277  // is the index larger than the size?
278  bool inU = ( idx < m_nMappedBuckets );
279  // is the index smaller than the size?
280  bool inL = ( pos >= 0.0 );
281 
282  // the trick done here is to clamp value into [m_minimum,m_maximum] without using if statements. The C++ Standard says that booleans are
283  // always 1 if true.
284  // NOTE: this is integral arithmetic
285  return ( inL && inU ) * idx + ( !inU && inL ) * ( m_nMappedBuckets - 1 );
286 }
287 
288 #endif // WVALUESETHISTOGRAM_H
Container which associate values with (uniform width) bins (aka intervals or buckets).
Definition: WHistogram.h:36
double m_minimum
The smallest value.
Definition: WHistogram.h:122
Abstract base class to all WValueSets.
Definition: WValueSetBase.h:60
Test WValueSetHistogram.
Used to find the occurrence frequencies of values in a value set.
virtual size_t accumulate(size_t startIndex, size_t endIndex) const
Sums up the buckets in the specified interval.
double m_mappedBucketSize
Size of one bucket in the mapped histogram.
virtual size_t at(size_t index) const
Get the count of the bucket.
WValueSetHistogram(std::shared_ptr< WValueSetBase > valueSet, size_t buckets=1000)
Constructor.
boost::shared_array< size_t > getInitialBuckets() const
Return the initial buckets.
boost::shared_array< size_t > m_initialBuckets
Pointer to all initial buckets of the histogram.
size_t getNInitialBuckets() const
Return the number of initial buckets.
size_t m_nInitialBuckets
Number of buckets in the initial histogram.
virtual size_t getIndexForValue(double value) const
Returns the right index to the bucket containing the given value.
virtual void insert(double value)
increment the value by one, contains the logic to find the element place in the array.
virtual size_t getTotalElementCount() const
This returns the number of value set entries added to the histogram.
double getInitialBucketSize() const
Return the size of one initial bucket.
virtual double getBucketSize(size_t index=0) const
Return the size of one bucket.
double m_initialBucketSize
Size of one bucket in the initial histogram.
virtual size_t operator[](size_t index) const
Get the count of the bucket.
void buildHistogram(const WValueSetBase &valueSet)
Actually builds the histogram.
size_t m_nMappedBuckets
Tracks the number of a buckets in the mapped histogram.
virtual size_t size() const
Returns the number of buckets in the histogram with the actual mapping.
virtual std::pair< double, double > getIntervalForIndex(size_t index) const
Returns the actual interval associated with the given index.
WValueSetHistogram & operator=(const WValueSetHistogram &other)
Copy assignment.
size_t m_nbTotalElements
The number of elements distributed in the buckets.
~WValueSetHistogram()
Destructor.
boost::shared_array< size_t > m_mappedBuckets
Pointer to all initial buckets of the histogram.