#include <iostream>
#include <vector>
#include <cstring>
#include <algorithm>
#include <numeric>
#include <iomanip>
#include <functional>
class Dataframe {
public:
enum class Base : char
{
SIGNED = 'S',
UNSIGNED = 'U',
CHAR = 'A',
// and other types like floats, date, timestamp, etc.
};
class Dtype
{
public:
Dtype(Base base_dtype, std::size_t size) : m_base_dtype(base_dtype), m_size(size) {}
auto base_dtype() const { return m_base_dtype; }
auto size() const { return m_size; }
private:
Base m_base_dtype;
std::size_t m_size;
};
class Comparator {
public:
using comparator_fn = std::function<std::int8_t(const char*, const char*)>;
// create a function that returns a comparator functions for each type
static comparator_fn make_comparator(Dataframe::Base base_dtype, std::size_t size, std::size_t offset) {
switch (base_dtype) {
case Dataframe::Base::CHAR:
return [offset](const char *a, const char *b) {
return std::strcmp(a + offset, b + offset);
};
case Dataframe::Base::SIGNED:
return [offset](const char *a, const char *b) {
const auto v1 = *reinterpret_cast<const int*>(a + offset);
const auto v2 = *reinterpret_cast<const int*>(b + offset);
return (v1 < v2) ? -1 : (v1 > v2) ? 1 : 0;
};
case Dataframe::Base::UNSIGNED:
return [offset](const char *a, const char *b) {
const auto v1 = *reinterpret_cast<const unsigned int*>(a + offset);
const auto v2 = *reinterpret_cast<const unsigned int*>(b + offset);
return (v1 < v2) ? -1 : (v1 > v2) ? 1 : 0;
};
default:
throw std::runtime_error("Unknown base dtype");
}
}
// create a function that
static comparator_fn make_composite_comparator(const std::vector<Dataframe::Dtype> &dtypes) {
std::vector<comparator_fn> F;
std::size_t offset = 0;
for (auto &dtype : dtypes) {
F.push_back(make_comparator(dtype.base_dtype(), dtype.size(), offset));
offset += dtype.size();
}
return [F](const char *a, const char *b) {
for (auto &f : F) {
if (const int8_t result = f(a, b); result != 0)
return result < 0;
}
return false;
};
}
};
Dataframe(const std::vector<char> &raw_data, const std::vector<Dtype> &dtypes) : m_raw_data(raw_data), m_dtypes(dtypes)
{
cols_count = m_dtypes.size();
m_record_size = std::accumulate(m_dtypes.begin(), m_dtypes.end(), 0, [](std::size_t acc, const Dtype &dtype) {
return acc + dtype.size();
});
m_records_count = m_raw_data.size() / m_record_size;
std::cout << "Info:" << std::endl;
std::cout << " - cols count: " << cols_count << std::endl;
std::cout << " - record size: " << m_record_size << std::endl;
std::cout << " - records count: " << m_records_count << std::endl;
std::cout << std::endl;
}
void print() {
// cycle over the records
std::cout << "Dataframe:" << std::endl;
for (std::size_t i = 0; i < m_records_count; i++) {
// cycle over the dtypes (e.g. columns)
auto current_field = m_raw_data.data() + i * m_record_size;
for (std::size_t j = 0; j < m_dtypes.size(); j++) {
auto dtype = m_dtypes[j];
auto base_dtype = dtype.base_dtype();
auto size = dtype.size();
current_field += j > 0 ? m_dtypes[j - 1].size() : 0;
// print the data
switch (base_dtype) {
case Base::CHAR:
std::cout << std::setw(1) << *current_field << ";";
break;
case Base::SIGNED:
std::cout << std::setw(4) << *reinterpret_cast<int*>(current_field) << ";";
break;
case Base::UNSIGNED:
std::cout << std::setw(4) << *reinterpret_cast<unsigned int*>(current_field) << ";";
break;
}
}
std::cout << std::endl;
}
std::cout << std::endl;
}
void sort() {
if (cols_count == 1) {
auto dtype = m_dtypes[0];
auto base_dtype = dtype.base_dtype();
auto size = dtype.size();
auto data = m_raw_data.data();
// print the data
switch (base_dtype) {
case Base::CHAR:
std::sort(data, data + m_raw_data.size());
break;
case Base::SIGNED:
std::sort((int*)data, (int*)data + m_raw_data.size() / size);
break;
case Base::UNSIGNED:
std::sort((unsigned int*)data, (unsigned int*)data + m_raw_data.size() / size);
break;
}
} else {
// This is a naive implementation, it works but it's not efficient
// use the ptr, compare and then swap the records
auto comparator = Comparator::make_composite_comparator(m_dtypes);
// create a set of pointers to the records
std::vector<char*> ptrs;
for (std::size_t i = 0; i < m_records_count; i++) {
ptrs.push_back(m_raw_data.data() + i * m_record_size);
}
// sort the pointers
std::sort(ptrs.begin(), ptrs.end(), comparator);
// apply pointers permutation
std::vector<char> tmp(m_raw_data.size());
for (std::size_t i = 0; i < m_records_count; i++) {
std::copy(ptrs[i], ptrs[i] + m_record_size, tmp.data() + i * m_record_size);
}
m_raw_data = tmp;
}
}
private:
std::vector<char> m_raw_data;
std::vector<Dtype> m_dtypes;
std::size_t m_records_count;
std::size_t m_record_size;
std::size_t cols_count;
};
void trivial_case_with_1_column() {
std::vector<char> raw;
int data[] = {8, 30, 8, 19, 7, 6, 10, 5, 3, 2, 1, 5};
raw.insert(raw.end(), (char*)data, (char*)data + sizeof(data));
Dataframe df(raw, {Dataframe::Dtype(Dataframe::Base::SIGNED, sizeof(int))});
df.print();
df.sort();
std::cout << "AFTER SORTING" << std::endl;
df.print();
}
void problematic_case_with_2_or_more_columns() {
std::vector<char> raw;
int column1[] = {8, 30, 8, 19, 7, 6, 10, 5, 3, 2, 1, 5};
char column2[] = {'z', 'a', 'a', 'b', 'c', 'd', 'e', 'x', 'g', 'h', 'i', 'a'};
for (std::size_t i = 0; i < sizeof(column1) / sizeof(int); i++) {
raw.insert(raw.end(), (char*)&column1[i], (char*)&column1[i] + sizeof(int));
raw.insert(raw.end(), (char*)&column2[i], (char*)&column2[i] + sizeof(char));
}
const auto dtypes = {Dataframe::Dtype(Dataframe::Base::SIGNED, sizeof(int)), Dataframe::Dtype(Dataframe::Base::CHAR, sizeof(char))};
Dataframe df(raw, dtypes);
df.print();
df.sort();
std::cout << "AFTER SORTING" << std::endl;
df.print();
}
int main(int argc, char const *argv[])
{
std::cout << "This case can be sorted by using a std::sort that casts the pointer to the correct type and it's efficient" << std::endl;
trivial_case_with_1_column();
std::cout << "----------------------------------------------------------------\n" << std::endl;
std::cout << "I don't know how to manage effectively this case" << std::endl;
problematic_case_with_2_or_more_columns();
return 0;
}