Temperature Data Parser

57 views

Posted: 28 Jun 2021 (17:26)
Last Edited: 27 Oct 2021 (21:36)

Salil M.

@cmd05

...

Parse Temperature data for given Date and Time

656c493e455e46d7d851f7d31a48e450.png


Why parsing is needed


An important aspect of data being fit for purpose is the structure it is found in. Often, the structure itself is not suitable for the needs of the data. For example:

  • The data capture system does not have fields for each distinct piece of information with a distinct use, leading to user workarounds, such as entering many distinct pieces of information into a single free text field, or using the wrong fields for information which has no obvious place (for example, placing company information in individual contact fields).
  • The data needs to be moved to a new system, with a different data structure.
  • Duplicates need to be removed from the data, and it is difficult to identify and remove duplicates due to the data structure (for example, key address identifiers such as the Premise Number are not separated from the remainder of the address).


Alternatively, the structure of the data may be sound, but the use of it insufficiently controlled, or subject to error. For example:

  • Users are not trained to gather all the required information, causing issues such as entering contacts with ‘data cheats’ rather than real names in the name fields.
  • The application displays fields in an illogical order, leading to users entering data in the wrong fields.
  • Users enter duplicate records in ways that are hard to detect, such as entering inaccurate data in multiple records representing the same entity, or entering the accurate data, but in the wrong fields.


These issues all lead to poor data quality, which may in many cases be costly to the business. It is therefore important for businesses to be able to analyze data for these problems, and to resolve them where necessary.


ymd_temp_readings.txt

START
[ NYC USA 100001 ]
{ year 1990 }
{ year 1991 { month jun } }
{ year 1992 { month jan (1 0 61.5) } {month feb (1 1 64) (2 2 65.2) } }
{ year 2000
      { month feb (1 1 68) (2 3 66.66) (1 0 67.2) }
      { month dec (12 15 -9.2) (16 14 -8.8) (14 0 -2) }
}
END


Main.cpp

#include <math.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Sentinels and plausibility limits used while parsing the readings file.
// NOTE(review): the original note here reads "celsius, ymd"; the unit of the
// stored temperatures is not otherwise established in this file - confirm.
constexpr int INVALID_READING{-2000};  // "no reading" marker; below absolute zero
constexpr int NOT_MONTH{-1};           // marks a Month whose index has not been set
constexpr int implausible_min{-200};   // temperatures below this are rejected
constexpr int implausible_max{200};    // temperatures above this are rejected
constexpr int min_year{1900};          // earliest accepted year
constexpr int max_year{2021};          // latest accepted year

// Checks that a read loop which stopped on a failed extraction stopped at the
// expected terminator character.
//
// If `ist` is not in a failed state, nothing happens. Otherwise the error
// state is cleared and the next non-whitespace character is read: when it
// equals `term` the function returns with the stream usable again; when it
// does not (or nothing can be read) a std::runtime_error carrying `message`
// is thrown.
void end_of_loop(std::istream& ist, char term, const std::string& message) {
    if(!ist.fail()) return;

    ist.clear();
    char ch = 0;
    if(ist >> ch && ch == term) return;

    // Report the caller's description rather than a generic text, so the
    // error names the construct that was left unterminated.
    throw std::runtime_error(message);
}

// Reports whether the next non-whitespace input begins with the "//" comment
// marker.
//
// Leading whitespace is skipped. When the next two characters are "//" they
// are consumed and true is returned. Otherwise false is returned and no
// non-whitespace character is consumed, so the caller can still read the
// upcoming token in full.
bool is_comment(std::istream& ist) {
    ist >> std::ws;
    if(ist.peek() != '/') return false;

    ist.get();  // first '/'
    if(ist.peek() == '/') {
        ist.get();  // second '/'
        return true;
    }

    // A lone '/': hand back the one character taken above. Never more than a
    // single character is returned to the stream.
    ist.unget();
    return false;
}

// Helpers that raise std::runtime_error with a value appended to `message`.
// None of them returns to the caller.

// Appends a single space followed by the character `ch`.
[[noreturn]] void throw_err(const std::string& message, char ch) {
    throw std::runtime_error(message + " " + ch);
}

// Appends ". Value: " followed by the decimal text of `i`.
[[noreturn]] void throw_err(const std::string& message, int i) {
    throw std::runtime_error(message + ". Value: " + std::to_string(i));
}

// Appends ". Value: " followed by std::to_string's rendering of `i`.
[[noreturn]] void throw_err(const std::string& message, double i) {
    throw std::runtime_error(message + ". Value: " + std::to_string(i));
}

// One parsed temperature entry, written in the input as "(day hour temperature)".
// Members are zero-initialized so a default-constructed Reading never holds
// indeterminate values.
struct Reading {
    int day = 0;               // day of month as read from the input
    int hour = 0;              // hour of day as read from the input
    double temperature = 0.0;  // temperature value as read from the input
};

// Temperature readings for one day, one slot per hour.
struct Day {
    // 24 entries indexed by hour; each starts as INVALID_READING, meaning
    // nothing has been recorded for that hour.
    std::vector<double> hour = std::vector<double>(24, INVALID_READING);
};

struct Month {
    int month = NOT_MONTH;  // 0-11 jan=0
    std::vector<Day> day {32};   // 1-31 one per day
};

// All readings of one calendar year.
struct Year {
    // Year number; zero-initialized so a default-constructed Year never holds
    // an indeterminate value (0 lies outside the accepted year range).
    int year = 0;
    // 12 Month slots indexed by month number, jan = 0.
    std::vector<Month> month {12};
};

// Everything parsed from one readings file.
struct Collection {
    // Location text assembled from the bracketed header of the file.
    std::string location{};
    // Parsed years, in the order they were read.
    std::vector<Year> years{};
};

// Reads one reading of the form "(day hour temperature)" into `r`.
//
// - If no character can be read at all, the stream is returned in its failed
//   state and `r` is left untouched.
// - If the next non-whitespace character is not '(', that character is put
//   back, failbit is set and `r` is left untouched. Callers use this as the
//   normal "no more readings" signal.
// - If '(' was seen but the rest is malformed, throw_err raises a
//   std::runtime_error ("bad reading" plus the day value parsed so far).
std::istream& operator>>(std::istream& ist, Reading& r) {
    char opener = 0;
    // Input exhausted: signal failure through the stream instead of falling
    // through to the parse below with nothing to read.
    if(!(ist >> opener)) return ist;

    if(opener != '(') {
        ist.unget();
        ist.clear(std::ios_base::failbit);
        return ist;
    }

    // Initialized so the error path below never reads an indeterminate value.
    int d = 0;
    int h = 0;
    double temp = 0.0;
    char closer = 0;
    ist >> d >> h >> temp >> closer;

    if(!ist || closer != ')') throw_err("bad reading", d);

    r.day = d;
    r.hour = h;
    r.temperature = temp;

    return ist;
}

// Returns true when the reading's day is within 1-31, its hour within 0-23,
// and its temperature is neither below implausible_min nor above
// implausible_max.
bool is_valid(const Reading& r) {
    const bool day_ok = 1 <= r.day && r.day <= 31;
    const bool hour_ok = 0 <= r.hour && r.hour <= 23;
    const bool temperature_rejected =
        r.temperature < implausible_min || r.temperature > implausible_max;

    return day_ok && hour_ok && !temperature_rejected;
}

// Month abbreviations as they appear in the input file. The position in the
// table is the month index (jan = 0 ... dec = 11).
const std::vector<std::string> month_input_tbl = {
    "jan", "feb", "mar",
    "apr", "may", "jun",
    "jul", "aug", "sep",
    "oct", "nov", "dec"
};

// Full month names for output. The position in the table is the month index
// (January = 0 ... December = 11).
// Every entry must be followed by a comma: adjacent string literals are
// concatenated by the compiler, which would silently merge two months into
// one entry and shorten the table.
const std::vector<std::string> month_output_tbl = {
    "January", "February", "March", "April",
    "May", "June", "July", "August", "September",
    "October", "November", "December"
};

// Maps a month abbreviation from month_input_tbl to its index (jan = 0).
// Returns -1 when the name is not in the table.
int month_to_int(std::string month_name) {
    const auto first = month_input_tbl.begin();
    const auto last = month_input_tbl.end();

    const auto hit = std::find(first, last, month_name);
    if(hit == last) return -1;

    return static_cast<int>(hit - first);
}

// Returns the full month name for month index `i` (0 = January).
// Throws std::runtime_error("Bad month index") when `i` is negative or does
// not refer to an entry of month_output_tbl.
std::string int_to_month(int i) {
    // Bound the index by the table's actual size rather than a fixed 12, so
    // an index can never reach past the end of the table.
    const bool in_table =
        i >= 0 &&
        static_cast<std::vector<std::string>::size_type>(i) < month_output_tbl.size();
    if(!in_table) throw std::runtime_error("Bad month index");

    return month_output_tbl[i];
}

// Reads one month block of the form "{ month <abbr> (d h t) (d h t) ... }"
// into `m`.
//
// - If no character can be read at all, the stream is returned in its failed
//   state and `m` is left untouched.
// - If the next non-whitespace character is not '{', it is put back, failbit
//   is set and `m` is left untouched. Callers use this as the normal "no
//   more months" signal.
// - Throws std::runtime_error when the "month" keyword is missing, the
//   abbreviation is not in month_input_tbl, any reading fails is_valid, the
//   same day/hour slot is given twice, or the block is not closed by '}'.
std::istream& operator>>(std::istream& ist, Month& m) {
    char ch = 0;
    // Input exhausted: signal failure through the stream rather than
    // reporting a malformed month header.
    if(!(ist >> ch)) return ist;

    if(ch != '{') {
        ist.unget();
        ist.clear(std::ios_base::failbit);
        return ist;
    }

    std::string month_marker;
    std::string mm;
    ist >> month_marker >> mm;
    if(!ist || month_marker != "month") throw std::runtime_error("Bad start of month");

    // One lookup serves both as the validity check and as the index.
    const int month_index = month_to_int(mm);
    if(month_index < 0) throw std::runtime_error("Month Not found");

    m.month = month_index;

    int duplicates = 0;
    int invalids = 0;

    /*
        m.day holds one Day per day number; each Day holds 24 hourly slots
        that start out as INVALID_READING. A valid reading (day d, hour h,
        temperature t) is stored as m.day[d].hour[h] = t. A slot that already
        holds something other than INVALID_READING was filled by an earlier
        reading and is counted as a duplicate.
     */
    for(Reading r; ist >> r;) {
        if(is_valid(r)) {
            if(m.day[r.day].hour[r.hour] != INVALID_READING) ++duplicates;
            m.day[r.day].hour[r.hour] = r.temperature;
        } else {
            ++invalids;
        }
    }

    if(invalids > 0) throw std::runtime_error("Invalid Readings in month ");
    if(duplicates > 0) throw std::runtime_error("Duplicate Readings in month ");

    // The loop above always ends with the stream in a failed state; this
    // accepts that only if the next character is the closing '}'.
    end_of_loop(ist, '}', "Month not terminated properly");

    return ist;
}

// Reads one year block of the form "{ year <yyyy> { month ... } ... }"
// into `y`.
//
// - If no character can be read at all, the stream is returned in its failed
//   state.
// - If the next non-whitespace character is not '{', it is put back and
//   failbit is set. Callers use this as the normal "no more years" signal.
// - Throws std::runtime_error when the "year" keyword or number is missing,
//   the year lies outside [min_year, max_year], the same month appears twice
//   in the block, or the block is not closed by '}'.
std::istream& operator>>(std::istream& ist, Year& y) {
    char ch = 0;
    // Check the extraction itself; comparing `ch` after a failed read would
    // inspect a value that was never set.
    if(!(ist >> ch)) return ist;

    if(ch != '{') {
        ist.unget();
        ist.clear(std::ios_base::failbit);
        return ist;
    }

    std::string year_marker;
    int yy = 0;
    ist >> year_marker >> yy;
    if(!ist || year_marker != "year") throw std::runtime_error("Invalid Year start ");
    if(yy < min_year || yy > max_year) throw std::runtime_error("Year outside range");

    y.year = yy;

    /*
        Each successfully read Month is stored at y.month[m.month], i.e. at
        its month index (jan = 0). A month read successfully always has an
        index in 0-11, so it can be used directly as a subscript.
     */
    bool seen[12] = {false};  // months already read within this block
    while(true) {
        Month m;
        if(!(ist >> m)) break;

        // A second block for the same month would silently replace the
        // first; reject it, as is done for repeated years and readings.
        if(seen[m.month]) throw std::runtime_error("Month repeated ");
        seen[m.month] = true;

        const int slot = m.month;
        y.month[slot] = std::move(m);
    }

    // The loop always ends with the stream in a failed state; this accepts
    // that only if the next character is the closing '}'.
    end_of_loop(ist, '}', "Invalid year end");
    return ist;
}

std::istream& operator>>(std::istream& ist, Collection& collection) {
    std::string start;
    ist >> start;
    if(start != "START") throw std::runtime_error("No 'START' identifier found");

    // mentioned location
    std::string city, country;
    char ch_1, ch_2;
    int zipcode;
    ist >> ch_1 >> city >> country >> zipcode >> ch_2;

    std::string location = city + + " " +std::to_string(zipcode) + ", " + country;
    const int zipsize = (zipcode <= 1) ? 1 : (log10(zipcode) + 1);

    if(ch_1 != '[' || ch_2 != ']' || !ist) {
        throw std::runtime_error("Invalid location format.");
    } else if(zipsize != 6) {
        throw std::runtime_error("Invalid zipcode");
    } else if(location.size() > 100) {
        throw std::runtime_error("Location name exceeds 100 characters");
    }
    
    std::cout << location;
    collection.location = location;

    while(true) {
        Year y;
        if(!(ist >> y)) break;
        for(Year yy : collection.years) if(yy.year == y.year) throw std::runtime_error("Year repeated ");
        collection.years.push_back(y);
    }
    
    if(ist.fail()) {
        ist.clear();
        std::string terminator;
        if(!(ist >> terminator && terminator == "END")) throw std::runtime_error("did not terminate collection with 'END' identifier");
    }
    
    return ist;
}

// Parses "ymd_temp_readings.txt" from the current directory.
// Returns 0 on success. On any failure (file cannot be opened, or the
// contents are rejected by the parser) the error text is written to
// std::cerr and 1 is returned.
int main() {
    try {
        std::ifstream ifs {"ymd_temp_readings.txt"};
        // Thrown inside the try block so it is reported like any other
        // error instead of terminating the program as an uncaught exception.
        if(!ifs) throw std::runtime_error("Could not open file");

        Collection readings{};
        ifs >> readings;
    } catch(const std::exception& e) {
        std::cerr << e.what() << '\n';
        return 1;
    }

    return 0;
}




Tags


cpp data parsing programming code

Comments





Hello World 16 Aug 2021 (19:05)  (edited)

hmmm

REPLY
Salil M. 16 Sep 2021 (23:02) (edited)

@nullptr ok cool

REPLY
Salil M. 21 Nov 2021 (04:49) (edited)

@cmd05 lol

REPLY