Project 4

Download the file bodyfat.csv. This is a dataset of body fat, age, height, and weight for a set of participants in a study. BMI categories are as follows:

| Category | BMI range |
|---|---|
| Severely underweight | BMI < 16.0 |
| Underweight | 16 <= BMI < 18.5 |
| Normal | 18.5 <= BMI < 25 |
| Overweight | 25 <= BMI < 30 |
| Obese Class I | 30 <= BMI < 35 |
| Obese Class II | 35 <= BMI < 40 |
| Obese Class III | BMI >= 40 |

Write a bmistats module containing functions for the following:

  1. Convert pounds to kilograms. Use the actual conversion factor, not the approximate one. Look it up on Google.
  2. Convert feet/inches to meters. Look up the conversion factor, do not guess at it.
  3. Compute BMI.
  4. Determine where the BMI falls in the table supplied and return that information in an appropriate form.

Write a file stats that implements the following:

  1. Mean of an array
  2. Standard deviation of an array
  3. Outlier rejection using Chauvenet’s criterion. Pseudocode given further down.

Write a main program that implements the following:

  1. Uses your other files
  2. Reads the input file into appropriate arrays (use one-dimensional arrays for this project). Don’t assume you know the length of the file (but you can assume the number of header lines is fixed).
  3. Pass appropriate arrays to a subroutine that computes an array of BMI data based on height and weight and returns the BMI array.
  4. Rejects the outlier(s). The function should return an array of logicals that you can apply to the original data using WHERE or similar. Create new arrays with the outlier(s) deleted.

Write a file that contains the corrected data for bodyfat and BMI. Use Excel or whatever you normally use to plot BMI as a function of percentage body fat. Be sure to plot it as a scatter plot (points only, no connecting lines).

Chauvenet’s criterion: It’s not the state of the art but works pretty well.

  1. Compute the mean and standard deviations of the observations.
  2. Compute the absolute values of the deviations, i.e. abs(A-mean(A))/std(A)
  3. Scale the deviations for the two-tailed test: devs=devs/sqrt(2.)
  4. Compute the probabilities prob=erfc(devs) : erfc is an intrinsic in any fairly recent Fortran compiler.
  5. The criterion is that we retain data with prob>=1./(2*N_obs), where N_obs is the number of observations.
Example solution

#include <cmath>
#include <iostream>

using namespace std;

// Arithmetic mean of the first n elements of A.
//
// The elements are added in index order into a single-precision
// accumulator and the total is divided by n. n is not validated here, so
// callers are expected to pass n > 0.
float mean(float *A, int n) {
    float total = 0.0f;
    int idx = 0;
    while (idx < n) {
        total += A[idx];
        ++idx;
    }
    return total / static_cast<float>(n);
}

// Population standard deviation of the first n elements of A.
//
// Computes sqrt( sum((A[i]-mean)^2) / n ), i.e. the divisor is n rather
// than n-1. n is not validated here, so callers are expected to pass n > 0.
float stdv(float *A, int n) {

    const float mean_A = mean(A, n);
    float sumsqr = 0.0f;
    for (int i = 0; i < n; ++i) {
        // Square by multiplication rather than pow(x,2): pow is a general
        // transcendental routine and far slower for a fixed exponent of 2.
        // The deviation is widened to double so the accumulated value is
        // the same one the pow-based form produced.
        const double deviation = A[i] - mean_A;
        sumsqr = static_cast<float>(sumsqr + deviation * deviation);
    }
    // std:: qualification selects the float overload explicitly instead of
    // depending on a using-directive being in effect.
    return std::sqrt(sumsqr / n);
}

// Flags outliers in A using Chauvenet's criterion.
//
// For each of the n observations the deviation from the mean is expressed
// in units of the standard deviation, divided by sqrt(2), and passed to
// erfc to get the two-tailed probability of a deviation at least that
// large. An observation is kept when that probability is at least 1/(2n).
//
// Parameters:
//   A    - observations (n elements, read only here)
//   mask - output, n elements; mask[i] is set to true when A[i] is
//          retained and false when it is rejected
//   n    - number of observations
// Returns the number of rejected observations.
int reject_outliers(float *A, bool* mask, int n) {

    if (n <= 0) {
        return 0;   // nothing to classify, mask is left untouched
    }

    const float mean_A = mean(A, n);
    const float stdv_A = stdv(A, n);

    // All observations identical: there is no spread, hence no outliers.
    // Without this guard dev would be 0/0 = NaN, the comparison below
    // would be false for every point, and the whole data set would be
    // rejected.
    if (stdv_A == 0.0f) {
        for (int i = 0; i < n; ++i) {
            mask[i] = true;
        }
        return 0;
    }

    const float criterion = 1.0 / (2. * n);
    const float sqrt2 = std::sqrt(2.0);

    int numRejected = 0;
    for (int i = 0; i < n; ++i) {
        // std::fabs / std::erfc are qualified so the floating-point
        // overloads are used regardless of surrounding using-directives.
        float dev = std::fabs(A[i] - mean_A) / stdv_A;
        dev /= sqrt2;
        const float prob = std::erfc(dev);

        const bool keep = (prob >= criterion);
        mask[i] = keep;
        if (!keep) {
            ++numRejected;
        }
    }
    return numRejected;
}
     


// Converts a weight in pounds to kilograms.
float convert_weight(float lbs);

// Converts a height given as feet plus inches to a single length.
// NOTE(review): the definition multiplies the total inches by 2.54, so the
// returned value is in centimeters; the main program scales it by 0.01 to
// obtain meters.
float convert_height(float feet, float inches);

// Computes the body-mass index as wt/(ht*ht). The main program passes the
// height in meters and the weight in kilograms.
float calculate_bmi(float ht, float wt);

// Maps a BMI value to its category number, 1 (lowest range, BMI < 16)
// through 7 (highest range, BMI >= 40).
int bmi_table(float bmi);

#include<iostream>
using namespace std;
// Converts a weight in pounds to kilograms.
//
// Uses the exact definition of the international avoirdupois pound,
// 1 lb = 0.45359237 kg, rather than the rounded value 0.453592; the two
// differ by several units in the last place of a float.
float convert_weight(float lbs) {
    const float kgPerPound = 0.45359237f;
    return lbs * kgPerPound;
}

// Converts a height given as feet plus inches into centimeters.
//
// The total number of inches (12 per foot) is multiplied by 2.54. Note
// that the result is in centimeters, not meters: the caller is
// responsible for any further scaling.
float convert_height(float feet, float inches) {
    const float cmPerInch = static_cast<float>(2.54);
    // The literal 12.0 is a double, so the sum and product are evaluated
    // in double precision before being narrowed to float on return.
    const double totalInches = feet * 12.0 + inches;
    const float centimeters = static_cast<float>(totalInches * cmPerInch);
    return centimeters;
}

// Body-mass index: weight divided by the square of the height.
// ht is not checked for zero.
float calculate_bmi(float ht, float wt) {
    const float heightSquared = ht * ht;
    return wt / heightSquared;
}

// Returns the 1-based BMI category number for bmi.
//
//   1: bmi < 16      2: 16 <= bmi < 18.5   3: 18.5 <= bmi < 25
//   4: 25 <= bmi < 30   5: 30 <= bmi < 35   6: 35 <= bmi < 40
//   7: everything else (bmi >= 40, and any value that compares below
//      none of the bounds)
int bmi_table(float bmi) {
    // Exclusive upper bound of categories 1..6, in ascending order.
    const float upperBounds[] = {16.0f, 18.5f, 25.0f, 30.0f, 35.0f, 40.0f};
    const int numBounds = sizeof(upperBounds) / sizeof(upperBounds[0]);

    // The first bound that bmi falls below identifies the category; the
    // lower bound is implied by having passed the previous iteration.
    for (int k = 0; k < numBounds; ++k) {
        if (bmi < upperBounds[k]) {
            return k + 1;
        }
    }
    return numBounds + 1;
}

/*
 * bmi.cxx
 * 
 * Author:    K. Holcomb
 * Changelog: Initial version 20160308
 */

#include <cmath>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "stats.h"
#include "bmistats.h"

using namespace std;

// Splits one line of comma-separated text into its fields.
static std::vector<std::string> split_csv_line(const std::string &line)
{
    std::vector<std::string> fields;
    std::stringstream lineStream(line);
    std::string field;
    while (std::getline(lineStream, field, ',')) {
        fields.push_back(field);
    }
    return fields;
}

// Parses the leading floating-point number in text; returns false if none.
static bool parse_float(const std::string &text, float &value)
{
    std::stringstream converter;
    converter << text;
    converter >> value;
    return !converter.fail();
}

// Reads the observations from an already-open CSV stream.
//
// The first line is treated as a header and discarded. Every following
// line is expected to carry at least four comma-separated columns, of
// which column 0 (body fat), column 2 (weight) and column 3 (height) are
// appended to bf, wt and ht. Blank lines are ignored. Lines with too few
// columns or unparsable numbers are skipped so that they cannot turn into
// bogus zero-valued observations.
//
// Returns the number of non-blank lines that had to be skipped.
static int read_observations(std::istream &in,
                             std::vector<float> &bf,
                             std::vector<float> &wt,
                             std::vector<float> &ht)
{
    const int numHeaderLines = 1;
    const std::size_t numColumns = 4;

    std::string line;
    for (int i = 0; i < numHeaderLines; ++i) {
        std::getline(in, line);
    }

    int numSkipped = 0;
    while (std::getline(in, line)) {
        if (line.find_first_not_of(" \t\r") == std::string::npos) {
            continue;   // blank line, typically at the end of the file
        }

        const std::vector<std::string> fields = split_csv_line(line);
        float bodyfat = 0.f, weight = 0.f, height = 0.f;
        if (fields.size() < numColumns ||
            !parse_float(fields[0], bodyfat) ||
            !parse_float(fields[2], weight) ||
            !parse_float(fields[3], height)) {
            ++numSkipped;
            continue;
        }

        bf.push_back(bodyfat);
        wt.push_back(weight);
        ht.push_back(height);
    }
    return numSkipped;
}

// Program entry point.
//
// Usage: <program> <input.csv>
//
// Reads body fat, weight and height from the CSV file named on the command
// line, computes the BMI of every observation, removes the observations
// whose BMI is rejected by reject_outliers, prints a text histogram of the
// BMI categories of the remaining observations to standard output, and
// writes the remaining body-fat/BMI pairs to "corrected_data.csv".
//
// Returns 0 on success and 1 when no file name was given, the input file
// cannot be opened or holds no usable observations, or the output file
// cannot be created.
int main(int argc, char **argv)
{
    const float zero = 0.;
    const int numCategories = 7;

    if (argc < 2) {
        std::cout << "No file name provided\n";
        return 1;
    }
    const std::string inFile = argv[1];

    std::ifstream fin(inFile.c_str());
    if (!fin.is_open()) {
        std::cerr << "Unable to open input file " << inFile << "\n";
        return 1;
    }

    // The vectors grow as lines are read, so the length of the file does
    // not have to be known in advance and nothing needs to be freed by hand.
    std::vector<float> bf, wt, ht;
    const int numSkipped = read_observations(fin, bf, wt, ht);
    fin.close();

    if (numSkipped > 0) {
        std::cerr << "Skipped " << numSkipped << " malformed line(s)\n";
    }

    const int nobs = static_cast<int>(bf.size());
    if (nobs == 0) {
        std::cerr << "No observations found in " << inFile << "\n";
        return 1;
    }

    std::vector<float> bmi(nobs);
    for (int i = 0; i < nobs; ++i) {
        const float kgs = convert_weight(wt[i]);
        //All feet are zero for this dataset
        //convert_height yields centimeters, hence the factor of 0.01
        const float ht_m = 0.01 * convert_height(zero, ht[i]);
        bmi[i] = calculate_bmi(ht_m, kgs);
    }

    //Apply Chauvenet criterion
    //reject_outliers needs a plain bool array, which std::vector<bool>
    //cannot provide; it is released as soon as the filtering is done.
    bool *mask = new bool[nobs];
    reject_outliers(&bmi[0], mask, nobs);

    std::vector<float> bodyfat_corrected, bmi_corrected;
    for (int i = 0; i < nobs; ++i) {
        if (mask[i]) {
            bmi_corrected.push_back(bmi[i]);
            bodyfat_corrected.push_back(bf[i]);
        }
    }
    delete[] mask;

    const int numValid = static_cast<int>(bmi_corrected.size());

    //Compute and print histogram
    int bins[numCategories] = {0};
    for (int i = 0; i < numValid; ++i) {
        const int category = bmi_table(bmi_corrected[i]);
        // Guard the index in case bmi_table ever returns a value
        // outside 1..numCategories.
        if (category >= 1 && category <= numCategories) {
            bins[category - 1] += 1;
        }
    }

    for (int i = 0; i < numCategories; ++i) {
        std::cout << "Category " << i + 1 << ":";
        for (int j = 0; j < bins[i]; ++j) {
            std::cout << '*';
        }
        std::cout << "\n";
    }

    std::ofstream fout("corrected_data.csv");
    if (!fout.is_open()) {
        std::cerr << "Unable to open output file corrected_data.csv\n";
        return 1;
    }

    fout << "Bodyfat Percentage,BMI\n";
    for (int i = 0; i < numValid; ++i) {
        fout << bodyfat_corrected[i] << "," << bmi_corrected[i] << "\n";
    }

    return 0;
}

Previous
Next