From 02d2b4854b3e877a2d0217a9ee724c6e69b9f50a Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 4 May 2023 11:04:40 -0400 Subject: [PATCH 001/121] Backup --- components/core/src/FileReaderSys.cpp | 200 ++++++++++++++++++++++++++ components/core/src/FileReaderSys.hpp | 138 ++++++++++++++++++ 2 files changed, 338 insertions(+) create mode 100644 components/core/src/FileReaderSys.cpp create mode 100644 components/core/src/FileReaderSys.hpp diff --git a/components/core/src/FileReaderSys.cpp b/components/core/src/FileReaderSys.cpp new file mode 100644 index 000000000..be172a996 --- /dev/null +++ b/components/core/src/FileReaderSys.cpp @@ -0,0 +1,200 @@ +#include "FileReaderSys.hpp" + +// Boost libraries +#include + +// C standard libraries +// C libraries +#include +#include + +// C++ standard libraries +#include + +// Project headers +#include + +using std::string; + +FileReaderSys::~FileReaderSys () { + close(); + m_read_buffer.release(); + free(m_getdelim_buf); +} + +ErrorCode FileReaderSys::try_get_pos (size_t& pos) { + if (-1 == m_fd) { + return ErrorCode_NotInit; + } + + pos = m_file_pos; + return ErrorCode_Success; +} + +ErrorCode FileReaderSys::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + if (-1 == m_fd) { + return ErrorCode_NotInit; + } + if (nullptr == buf) { + return ErrorCode_BadParam; + } + + size_t remaining_data = m_buffer_length - m_buffer_pos; + + if (num_bytes_to_read <= remaining_data) { + memcpy(buf, m_read_buffer.get() + m_buffer_pos, num_bytes_to_read); + // increment cursors + m_buffer_pos += num_bytes_to_read; + m_file_pos += num_bytes_to_read; + return ErrorCode_Success; + } else { + // first, read everything from buffer + size_t next_partial_read = num_bytes_to_read - remaining_data; + memcpy(buf, m_read_buffer.get() + m_buffer_pos, num_bytes_to_read); + + // refill the buffer + m_buffer_length = ::read(m_fd, m_read_buffer.get(), cReaderBufferSize); + if (m_buffer_length 
== -1) { + return ErrorCode_errno; + } + if (m_buffer_length < next_partial_read) { + memcpy(buf + remaining_data, m_read_buffer.get(), m_buffer_length); + m_buffer_pos = m_buffer_length; + num_bytes_read = remaining_data + m_buffer_length; + m_file_pos += num_bytes_read; + return ErrorCode_EndOfFile; + } else { + memcpy(buf + remaining_data, m_read_buffer.get(), next_partial_read); + m_buffer_pos = next_partial_read; + num_bytes_read = num_bytes_to_read; + m_file_pos += num_bytes_read; + return ErrorCode_Success; + } + } +} + +ErrorCode FileReaderSys::try_seek_from_begin (size_t pos) { + if (m_fd == -1) { + return ErrorCode_NotInit; + } + if (pos > m_file_pos) { + auto offset = lseek(m_fd, pos, SEEK_SET); + if (offset != pos) { + return ErrorCode_errno; + } + m_buffer_length = 0; + m_buffer_pos = 0; + m_file_pos = pos; + } else { + // the maximum value we can seek back is m_buffer_pos; + auto seek_back_amount = m_file_pos - pos; + if (seek_back_amount > m_buffer_pos) { + SPDLOG_ERROR("Can back trace anymore"); + throw; + } else { + m_buffer_pos = m_buffer_pos - seek_back_amount; + m_file_pos = pos; + } + } + return ErrorCode_Success; +} + + +ErrorCode FileReaderSys::try_open (const string& path) { + // Cleanup in case caller forgot to call close before calling this function + close(); + + m_fd = ::open(path.c_str(), O_RDONLY); + if (-1 == m_fd) { + if (ENOENT == errno) { + return ErrorCode_FileNotFound; + } + return ErrorCode_errno; + } + m_path = path; + m_read_buffer = 0; + m_file_pos = 0; + m_buffer_length = 0; + + return ErrorCode_Success; +} + +ErrorCode FileReaderSys::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, string& str) { + assert(-1 != m_fd); + + if (false == append) { + str.clear(); + } + + bool found_delim {false}; + + while (false == found_delim) { + auto cursor {m_buffer_pos}; + while (cursor < m_buffer_length) { + if (delim == m_read_buffer[cursor]) { + found_delim = true; + break; + } + cursor++; + } + if (found_delim) { + 
// append to strings + std::string_view substr {m_read_buffer.get() + m_buffer_pos, cursor - m_buffer_pos}; + str.append(substr); + // increase file pos + m_file_pos += cursor - m_buffer_pos; + m_buffer_pos = cursor; + } else { + // if we didn't find a delimiter, we append the current buffer to the str and + // read out a new buffer + std::string_view substr {m_read_buffer.get() + m_buffer_pos, m_buffer_length - m_buffer_pos}; + str.append(substr); + m_file_pos += m_buffer_length - m_buffer_pos; + m_buffer_pos = 0; + // refill the buffer + m_buffer_length = ::read(m_fd, m_read_buffer.get(), cReaderBufferSize); + if (m_buffer_length == -1) { + return ErrorCode_errno; + } + if (m_buffer_length == 0) { + return ErrorCode_EndOfFile; + } + } + } + return ErrorCode_Success; +} + +void FileReaderSys::open (const string& path) { + ErrorCode error_code = try_open(path); + if (ErrorCode_Success != error_code) { + if (ErrorCode_FileNotFound == error_code) { + throw "File not found: " + boost::filesystem::weakly_canonical(path).string() + "\n"; + } else { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + } +} + +void FileReaderSys::close () { + if (-1 != m_fd) { + // NOTE: We don't check errors for fclose since it seems the only reason it could fail is if it was interrupted + // by a signal + auto res = ::close(m_fd); + if (0 != res) { + throw "Not sure why close fail\n"; + } + m_fd = -1; + } +} + +ErrorCode FileReaderSys::try_fstat (struct stat& stat_buffer) { + if (-1 == m_fd) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + auto return_value = fstat(m_fd, &stat_buffer); + if (0 != return_value) { + return ErrorCode_errno; + } + return ErrorCode_Success; +} diff --git a/components/core/src/FileReaderSys.hpp b/components/core/src/FileReaderSys.hpp new file mode 100644 index 000000000..950225830 --- /dev/null +++ b/components/core/src/FileReaderSys.hpp @@ -0,0 +1,138 @@ +#ifndef FileReaderSys_HPP +#define FileReaderSys_HPP + +// C 
standard libraries + +// C++ libraries +#include +#include +#include + +// Project headers +#include "Defs.h" +#include "ErrorCode.hpp" +#include "ReaderInterface.hpp" +#include "TraceableException.hpp" + + +class FileReaderSys : ReaderInterface { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : TraceableException (error_code, filename, line_number) {} + + // Methods + const char* what () const noexcept override { + return "FileReader operation failed"; + } + }; + + // Constructors + FileReaderSys() : m_file_pos(0), m_buffer_pos(0), m_fd(-1) { + m_read_buffer = std::make_unique(cReaderBufferSize); + } + ~FileReaderSys(); + // Methods implementing the ReaderInterface + /** + * Tries to get the current position of the read head in the file + * @param pos Position of the read head in the file + * @return ErrorCode_NotInit if the file is not open + * @return ErrorCode_errno on error + * @return ErrorCode_Success on success + */ + ErrorCode try_get_pos (size_t& pos) override; + /** + * Tries to seek from the beginning of the file to the given position + * @param pos + * @return ErrorCode_NotInit if the file is not open + * @return ErrorCode_errno on error + * @return ErrorCode_Success on success + */ + ErrorCode try_seek_from_begin (size_t pos) override; + + /** + * Tries to read up to a given number of bytes from the file + * @param buf + * @param num_bytes_to_read The number of bytes to try and read + * @param num_bytes_read The actual number of bytes read + * @return ErrorCode_NotInit if the file is not open + * @return ErrorCode_BadParam if buf is invalid + * @return ErrorCode_errno on error + * @return ErrorCode_EndOfFile on EOF + * @return ErrorCode_Success on success + */ + ErrorCode try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; + + /** + * Tries to read a string from the file until it 
reaches the specified delimiter + * @param delim The delimiter to stop at + * @param keep_delimiter Whether to include the delimiter in the output string or not + * @param append Whether to append to the given string or replace its contents + * @param str The string read + * @return ErrorCode_Success on success + * @return ErrorCode_EndOfFile on EOF + * @return ErrorCode_errno otherwise + */ + ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; + + // Methods + bool is_open () const { return -1 != m_fd; } + + /** + * Tries to open a file + * @param path + * @return ErrorCode_Success on success + * @return ErrorCode_FileNotFound if the file was not found + * @return ErrorCode_errno otherwise + */ + ErrorCode try_open (const std::string& path); + /** + * Opens a file + * @param path + * @throw FileReader::OperationFailed on failure + */ + void open (const std::string& path); + /** + * Closes the file if it's open + */ + void close (); + + [[nodiscard]] const std::string& get_path () const { return m_path; } + + /** + * Tries to stat the current file + * @param stat_buffer + * @return ErrorCode_errno on error + * @return ErrorCode_Success on success + */ + ErrorCode try_fstat (struct stat& stat_buffer); + + +private: + // Types + size_t m_file_pos; + ssize_t m_buffer_length; + size_t m_buffer_pos; + std::unique_ptr m_read_buffer; + int m_fd; + std::string m_path; + size_t m_getdelim_buf_len; + char* m_getdelim_buf; + + static constexpr size_t cReaderBufferSize = 65536; + // Constants + + // Factory functions + + // Assignment operators + + // Methods + + // Variables + +}; + + +#endif // FileReaderSys_HPP From 21581c04441aa217aa4786aa92fc029414bd3668 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Fri, 5 May 2023 15:29:45 -0400 Subject: [PATCH 002/121] simple working version backup --- components/core/src/FileReader.cpp | 227 +++++++++++++++++++------- 
components/core/src/FileReader.hpp | 44 +++-- components/core/src/FileReaderSys.cpp | 200 ----------------------- components/core/src/FileReaderSys.hpp | 138 ---------------- 4 files changed, 200 insertions(+), 409 deletions(-) delete mode 100644 components/core/src/FileReaderSys.cpp delete mode 100644 components/core/src/FileReaderSys.hpp diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index f1b740d8b..4f8bd23c2 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -4,90 +4,214 @@ #include // C standard libraries -#include +// C libraries #include -#include +#include -// C++ libraries -#include +// C++ standard libraries #include +// Project headers +#include + using std::string; FileReader::~FileReader () { close(); - free(m_getdelim_buf); + free(m_read_buffer); +} + +ErrorCode FileReader::try_get_pos (size_t& pos) { + if (-1 == m_fd) { + return ErrorCode_NotInit; + } + + pos = m_file_pos; + return ErrorCode_Success; } ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - if (nullptr == m_file) { + if (-1 == m_fd) { return ErrorCode_NotInit; } if (nullptr == buf) { return ErrorCode_BadParam; } - - num_bytes_read = fread(buf, sizeof(*buf), num_bytes_to_read, m_file); - if (num_bytes_read < num_bytes_to_read) { - if (ferror(m_file)) { + if (started_reading == false) { + started_reading = true; + auto offset = lseek(m_fd, m_file_pos, SEEK_SET); + if (offset != m_file_pos) { return ErrorCode_errno; - } else if (feof(m_file)) { - if (0 == num_bytes_read) { + } + + } + size_t remaining_data = m_buffer_length - m_buffer_pos; + + if (num_bytes_to_read <= remaining_data) { + memcpy(buf, m_read_buffer + m_buffer_pos, num_bytes_to_read); + // increment cursors + m_buffer_pos += num_bytes_to_read; + m_file_pos += num_bytes_to_read; + num_bytes_read = num_bytes_to_read; + return ErrorCode_Success; + } else { + // first, read everything from buffer + size_t 
next_partial_read = num_bytes_to_read - remaining_data; + memcpy(buf, m_read_buffer + m_buffer_pos, remaining_data); + + if (reached_eof) { + m_file_pos += remaining_data; + m_buffer_pos += m_buffer_length; + num_bytes_read = remaining_data; + if (num_bytes_read == 0) { return ErrorCode_EndOfFile; } + return ErrorCode_Success; } - } - return ErrorCode_Success; + // refill the buffer + m_buffer_length = ::read(m_fd, m_read_buffer, cReaderBufferSize); + if (m_buffer_length == -1) { + return ErrorCode_errno; + } + if (m_buffer_length < cReaderBufferSize) { + reached_eof = true; + } + if (m_buffer_length < next_partial_read) { + memcpy(buf + remaining_data, m_read_buffer, m_buffer_length); + m_buffer_pos = m_buffer_length; + num_bytes_read = remaining_data + m_buffer_length; + m_file_pos += num_bytes_read; + return ErrorCode_Success; + } else { + memcpy(buf + remaining_data, m_read_buffer, next_partial_read); + m_buffer_pos = next_partial_read; + num_bytes_read = num_bytes_to_read; + m_file_pos += num_bytes_read; + return ErrorCode_Success; + } + } } +// Maybe everytime, I should always read a page? 
ErrorCode FileReader::try_seek_from_begin (size_t pos) { - if (nullptr == m_file) { + printf("try seek on fd %d\n", m_fd); + if (m_fd == -1) { return ErrorCode_NotInit; } - - int retval = fseeko(m_file, pos, SEEK_SET); - if (0 != retval) { - return ErrorCode_errno; + if (pos > m_file_pos) { + auto front_seek_amount = pos - m_file_pos; + if (front_seek_amount > m_buffer_length - m_buffer_pos) { + // if the seek-to pos is out of buffer + printf("Seek front on %d\n", m_fd); + m_buffer_length = 0; + m_buffer_pos = 0; + m_file_pos = pos; + } else { + // otherwise, we can simply + printf("simple seek front on %d\n", m_fd); + m_buffer_pos += front_seek_amount; + m_file_pos = pos; + } + } else { + printf("Seek back on %d\n", m_fd); + // the maximum value we can seek back is m_buffer_pos; + if(started_reading == false) { + m_file_pos = pos; + } else { + auto seek_back_amount = m_file_pos - pos; + if (seek_back_amount > m_buffer_pos) { + SPDLOG_ERROR("Can't back trace anymore"); + throw; + } else { + m_buffer_pos = m_buffer_pos - seek_back_amount; + m_file_pos = pos; + } + } } - return ErrorCode_Success; } -ErrorCode FileReader::try_get_pos (size_t& pos) { - if (nullptr == m_file) { - return ErrorCode_NotInit; - } - - pos = ftello(m_file); - if ((off_t)-1 == pos) { - return ErrorCode_errno; - } - - return ErrorCode_Success; -} ErrorCode FileReader::try_open (const string& path) { // Cleanup in case caller forgot to call close before calling this function close(); - m_file = fopen(path.c_str(), "rb"); - if (nullptr == m_file) { + m_fd = ::open(path.c_str(), O_RDONLY); + if (-1 == m_fd) { if (ENOENT == errno) { return ErrorCode_FileNotFound; } return ErrorCode_errno; } m_path = path; + m_buffer_pos = 0; + m_file_pos = 0; + // If I open here, later I may get eof error, so I can not open here + // so early read might not be a good idea + m_buffer_length = 0; + reached_eof = false; + started_reading = false; return ErrorCode_Success; } +ErrorCode FileReader::try_read_to_delimiter 
(char delim, bool keep_delimiter, bool append, string& str) { + assert(-1 != m_fd); + + if (false == append) { + str.clear(); + } + + bool found_delim {false}; + + while (false == found_delim) { + auto cursor {m_buffer_pos}; + while (cursor < m_buffer_length) { + if (delim == m_read_buffer[cursor]) { + found_delim = true; + break; + } + cursor++; + } + if (found_delim) { + // append to strings + std::string_view substr {m_read_buffer + m_buffer_pos, cursor + 1 - m_buffer_pos}; + str.append(substr); + // increase file pos + m_file_pos += (cursor + 1) - m_buffer_pos; + m_buffer_pos = cursor + 1; + } else { + // if we didn't find a delimiter, we append the current buffer to the str and + // read out a new buffer + std::string_view substr {m_read_buffer + m_buffer_pos, m_buffer_length - m_buffer_pos}; + str.append(substr); + // refill the buffer + if (reached_eof) { + return ErrorCode_EndOfFile; + } + m_file_pos += m_buffer_length - m_buffer_pos; + m_buffer_pos = 0; + m_buffer_length = ::read(m_fd, m_read_buffer, cReaderBufferSize); + if (m_buffer_length < cReaderBufferSize) { + reached_eof = true; + } + if (m_buffer_length == -1) { + return ErrorCode_errno; + } + if (m_buffer_length == 0) { + return ErrorCode_EndOfFile; + } + } + } + return ErrorCode_Success; +} + void FileReader::open (const string& path) { ErrorCode error_code = try_open(path); if (ErrorCode_Success != error_code) { if (ErrorCode_FileNotFound == error_code) { - throw "File not found: " + boost::filesystem::weakly_canonical(path).string() + "\n"; + throw "File not found: " + boost::filesystem::weakly_canonical(path).string() + "\n"; } else { throw OperationFailed(error_code, __FILENAME__, __LINE__); } @@ -95,42 +219,23 @@ void FileReader::open (const string& path) { } void FileReader::close () { - if (m_file != nullptr) { + if (-1 != m_fd) { // NOTE: We don't check errors for fclose since it seems the only reason it could fail is if it was interrupted // by a signal - fclose(m_file); - m_file = 
nullptr; - } -} - -ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, string& str) { - assert(nullptr != m_file); - - if (false == append) { - str.clear(); - } - ssize_t num_bytes_read = getdelim(&m_getdelim_buf, &m_getdelim_buf_len, delim, m_file); - if (num_bytes_read < 1) { - if (ferror(m_file)) { - return ErrorCode_errno; - } else if (feof(m_file)) { - return ErrorCode_EndOfFile; + auto res = ::close(m_fd); + if (0 != res) { + throw "Not sure why close fail\n"; } + m_fd = -1; } - if (false == keep_delimiter && delim == m_getdelim_buf[num_bytes_read - 1]) { - --num_bytes_read; - } - str.append(m_getdelim_buf, num_bytes_read); - - return ErrorCode_Success; } ErrorCode FileReader::try_fstat (struct stat& stat_buffer) { - if (nullptr == m_file) { + if (-1 == m_fd) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - auto return_value = fstat(fileno(m_file), &stat_buffer); + auto return_value = fstat(m_fd, &stat_buffer); if (0 != return_value) { return ErrorCode_errno; } diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 6347a0bea..801e8c3d8 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -1,8 +1,11 @@ -#ifndef FILEREADER_HPP -#define FILEREADER_HPP +#ifndef FileReaderSys_HPP +#define FileReaderSys_HPP + +// C standard libraries // C++ libraries #include +#include #include // Project headers @@ -11,6 +14,7 @@ #include "ReaderInterface.hpp" #include "TraceableException.hpp" + class FileReader : public ReaderInterface { public: // Types @@ -25,9 +29,11 @@ class FileReader : public ReaderInterface { } }; - FileReader () : m_file(nullptr), m_getdelim_buf_len(0), m_getdelim_buf(nullptr) {} - ~FileReader (); - + // Constructors + FileReader() : m_file_pos(0), m_buffer_pos(0), m_fd(-1) { + m_read_buffer = (char*)malloc(sizeof(char) * cReaderBufferSize); + } + ~FileReader(); // Methods implementing the ReaderInterface /** * Tries to 
get the current position of the read head in the file @@ -72,7 +78,7 @@ class FileReader : public ReaderInterface { ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; // Methods - bool is_open () const { return m_file != nullptr; } + bool is_open () const { return -1 != m_fd; } /** * Tries to open a file @@ -103,11 +109,29 @@ class FileReader : public ReaderInterface { */ ErrorCode try_fstat (struct stat& stat_buffer); + private: - FILE* m_file; - size_t m_getdelim_buf_len; - char* m_getdelim_buf; + // Types + size_t m_file_pos; + ssize_t m_buffer_length; + size_t m_buffer_pos; + char* m_read_buffer; + int m_fd; std::string m_path; + bool reached_eof; + bool started_reading; + static constexpr size_t cReaderBufferSize = 1024; + // Constants + + // Factory functions + + // Assignment operators + + // Methods + + // Variables + }; -#endif // FILEREADER_HPP + +#endif // FileReaderSys_HPP diff --git a/components/core/src/FileReaderSys.cpp b/components/core/src/FileReaderSys.cpp deleted file mode 100644 index be172a996..000000000 --- a/components/core/src/FileReaderSys.cpp +++ /dev/null @@ -1,200 +0,0 @@ -#include "FileReaderSys.hpp" - -// Boost libraries -#include - -// C standard libraries -// C libraries -#include -#include - -// C++ standard libraries -#include - -// Project headers -#include - -using std::string; - -FileReaderSys::~FileReaderSys () { - close(); - m_read_buffer.release(); - free(m_getdelim_buf); -} - -ErrorCode FileReaderSys::try_get_pos (size_t& pos) { - if (-1 == m_fd) { - return ErrorCode_NotInit; - } - - pos = m_file_pos; - return ErrorCode_Success; -} - -ErrorCode FileReaderSys::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - if (-1 == m_fd) { - return ErrorCode_NotInit; - } - if (nullptr == buf) { - return ErrorCode_BadParam; - } - - size_t remaining_data = m_buffer_length - m_buffer_pos; - - if (num_bytes_to_read <= remaining_data) { - memcpy(buf, 
m_read_buffer.get() + m_buffer_pos, num_bytes_to_read); - // increment cursors - m_buffer_pos += num_bytes_to_read; - m_file_pos += num_bytes_to_read; - return ErrorCode_Success; - } else { - // first, read everything from buffer - size_t next_partial_read = num_bytes_to_read - remaining_data; - memcpy(buf, m_read_buffer.get() + m_buffer_pos, num_bytes_to_read); - - // refill the buffer - m_buffer_length = ::read(m_fd, m_read_buffer.get(), cReaderBufferSize); - if (m_buffer_length == -1) { - return ErrorCode_errno; - } - if (m_buffer_length < next_partial_read) { - memcpy(buf + remaining_data, m_read_buffer.get(), m_buffer_length); - m_buffer_pos = m_buffer_length; - num_bytes_read = remaining_data + m_buffer_length; - m_file_pos += num_bytes_read; - return ErrorCode_EndOfFile; - } else { - memcpy(buf + remaining_data, m_read_buffer.get(), next_partial_read); - m_buffer_pos = next_partial_read; - num_bytes_read = num_bytes_to_read; - m_file_pos += num_bytes_read; - return ErrorCode_Success; - } - } -} - -ErrorCode FileReaderSys::try_seek_from_begin (size_t pos) { - if (m_fd == -1) { - return ErrorCode_NotInit; - } - if (pos > m_file_pos) { - auto offset = lseek(m_fd, pos, SEEK_SET); - if (offset != pos) { - return ErrorCode_errno; - } - m_buffer_length = 0; - m_buffer_pos = 0; - m_file_pos = pos; - } else { - // the maximum value we can seek back is m_buffer_pos; - auto seek_back_amount = m_file_pos - pos; - if (seek_back_amount > m_buffer_pos) { - SPDLOG_ERROR("Can back trace anymore"); - throw; - } else { - m_buffer_pos = m_buffer_pos - seek_back_amount; - m_file_pos = pos; - } - } - return ErrorCode_Success; -} - - -ErrorCode FileReaderSys::try_open (const string& path) { - // Cleanup in case caller forgot to call close before calling this function - close(); - - m_fd = ::open(path.c_str(), O_RDONLY); - if (-1 == m_fd) { - if (ENOENT == errno) { - return ErrorCode_FileNotFound; - } - return ErrorCode_errno; - } - m_path = path; - m_read_buffer = 0; - m_file_pos 
= 0; - m_buffer_length = 0; - - return ErrorCode_Success; -} - -ErrorCode FileReaderSys::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, string& str) { - assert(-1 != m_fd); - - if (false == append) { - str.clear(); - } - - bool found_delim {false}; - - while (false == found_delim) { - auto cursor {m_buffer_pos}; - while (cursor < m_buffer_length) { - if (delim == m_read_buffer[cursor]) { - found_delim = true; - break; - } - cursor++; - } - if (found_delim) { - // append to strings - std::string_view substr {m_read_buffer.get() + m_buffer_pos, cursor - m_buffer_pos}; - str.append(substr); - // increase file pos - m_file_pos += cursor - m_buffer_pos; - m_buffer_pos = cursor; - } else { - // if we didn't find a delimiter, we append the current buffer to the str and - // read out a new buffer - std::string_view substr {m_read_buffer.get() + m_buffer_pos, m_buffer_length - m_buffer_pos}; - str.append(substr); - m_file_pos += m_buffer_length - m_buffer_pos; - m_buffer_pos = 0; - // refill the buffer - m_buffer_length = ::read(m_fd, m_read_buffer.get(), cReaderBufferSize); - if (m_buffer_length == -1) { - return ErrorCode_errno; - } - if (m_buffer_length == 0) { - return ErrorCode_EndOfFile; - } - } - } - return ErrorCode_Success; -} - -void FileReaderSys::open (const string& path) { - ErrorCode error_code = try_open(path); - if (ErrorCode_Success != error_code) { - if (ErrorCode_FileNotFound == error_code) { - throw "File not found: " + boost::filesystem::weakly_canonical(path).string() + "\n"; - } else { - throw OperationFailed(error_code, __FILENAME__, __LINE__); - } - } -} - -void FileReaderSys::close () { - if (-1 != m_fd) { - // NOTE: We don't check errors for fclose since it seems the only reason it could fail is if it was interrupted - // by a signal - auto res = ::close(m_fd); - if (0 != res) { - throw "Not sure why close fail\n"; - } - m_fd = -1; - } -} - -ErrorCode FileReaderSys::try_fstat (struct stat& stat_buffer) { - if (-1 == m_fd) { - 
throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); - } - - auto return_value = fstat(m_fd, &stat_buffer); - if (0 != return_value) { - return ErrorCode_errno; - } - return ErrorCode_Success; -} diff --git a/components/core/src/FileReaderSys.hpp b/components/core/src/FileReaderSys.hpp deleted file mode 100644 index 950225830..000000000 --- a/components/core/src/FileReaderSys.hpp +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef FileReaderSys_HPP -#define FileReaderSys_HPP - -// C standard libraries - -// C++ libraries -#include -#include -#include - -// Project headers -#include "Defs.h" -#include "ErrorCode.hpp" -#include "ReaderInterface.hpp" -#include "TraceableException.hpp" - - -class FileReaderSys : ReaderInterface { -public: - // Types - class OperationFailed : public TraceableException { - public: - // Constructors - OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : TraceableException (error_code, filename, line_number) {} - - // Methods - const char* what () const noexcept override { - return "FileReader operation failed"; - } - }; - - // Constructors - FileReaderSys() : m_file_pos(0), m_buffer_pos(0), m_fd(-1) { - m_read_buffer = std::make_unique(cReaderBufferSize); - } - ~FileReaderSys(); - // Methods implementing the ReaderInterface - /** - * Tries to get the current position of the read head in the file - * @param pos Position of the read head in the file - * @return ErrorCode_NotInit if the file is not open - * @return ErrorCode_errno on error - * @return ErrorCode_Success on success - */ - ErrorCode try_get_pos (size_t& pos) override; - /** - * Tries to seek from the beginning of the file to the given position - * @param pos - * @return ErrorCode_NotInit if the file is not open - * @return ErrorCode_errno on error - * @return ErrorCode_Success on success - */ - ErrorCode try_seek_from_begin (size_t pos) override; - - /** - * Tries to read up to a given number of bytes from the file - * @param buf - * @param 
num_bytes_to_read The number of bytes to try and read - * @param num_bytes_read The actual number of bytes read - * @return ErrorCode_NotInit if the file is not open - * @return ErrorCode_BadParam if buf is invalid - * @return ErrorCode_errno on error - * @return ErrorCode_EndOfFile on EOF - * @return ErrorCode_Success on success - */ - ErrorCode try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; - - /** - * Tries to read a string from the file until it reaches the specified delimiter - * @param delim The delimiter to stop at - * @param keep_delimiter Whether to include the delimiter in the output string or not - * @param append Whether to append to the given string or replace its contents - * @param str The string read - * @return ErrorCode_Success on success - * @return ErrorCode_EndOfFile on EOF - * @return ErrorCode_errno otherwise - */ - ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; - - // Methods - bool is_open () const { return -1 != m_fd; } - - /** - * Tries to open a file - * @param path - * @return ErrorCode_Success on success - * @return ErrorCode_FileNotFound if the file was not found - * @return ErrorCode_errno otherwise - */ - ErrorCode try_open (const std::string& path); - /** - * Opens a file - * @param path - * @throw FileReader::OperationFailed on failure - */ - void open (const std::string& path); - /** - * Closes the file if it's open - */ - void close (); - - [[nodiscard]] const std::string& get_path () const { return m_path; } - - /** - * Tries to stat the current file - * @param stat_buffer - * @return ErrorCode_errno on error - * @return ErrorCode_Success on success - */ - ErrorCode try_fstat (struct stat& stat_buffer); - - -private: - // Types - size_t m_file_pos; - ssize_t m_buffer_length; - size_t m_buffer_pos; - std::unique_ptr m_read_buffer; - int m_fd; - std::string m_path; - size_t m_getdelim_buf_len; - char* m_getdelim_buf; - - static constexpr 
size_t cReaderBufferSize = 65536; - // Constants - - // Factory functions - - // Assignment operators - - // Methods - - // Variables - -}; - - -#endif // FileReaderSys_HPP From 5ff962e159d98166058108b6ced49b27543be0b3 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sat, 6 May 2023 16:35:18 -0400 Subject: [PATCH 003/121] backup --- components/core/src/FileReader.cpp | 58 ++++++++++++++++++------------ 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 4f8bd23c2..10e5d7f80 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -53,11 +53,12 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num m_buffer_pos += num_bytes_to_read; m_file_pos += num_bytes_to_read; num_bytes_read = num_bytes_to_read; - return ErrorCode_Success; } else { + // else if data is not enough. // first, read everything from buffer size_t next_partial_read = num_bytes_to_read - remaining_data; memcpy(buf, m_read_buffer + m_buffer_pos, remaining_data); + num_bytes_read = remaining_data; if (reached_eof) { m_file_pos += remaining_data; @@ -69,36 +70,49 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num return ErrorCode_Success; } - // refill the buffer - m_buffer_length = ::read(m_fd, m_read_buffer, cReaderBufferSize); - if (m_buffer_length == -1) { - return ErrorCode_errno; - } - if (m_buffer_length < cReaderBufferSize) { - reached_eof = true; - } - if (m_buffer_length < next_partial_read) { - memcpy(buf + remaining_data, m_read_buffer, m_buffer_length); - m_buffer_pos = m_buffer_length; - num_bytes_read = remaining_data + m_buffer_length; - m_file_pos += num_bytes_read; - return ErrorCode_Success; - } else { - memcpy(buf + remaining_data, m_read_buffer, next_partial_read); - m_buffer_pos = next_partial_read; - num_bytes_read = num_bytes_to_read; - m_file_pos += 
num_bytes_read; - return ErrorCode_Success; + bool finish_reading = false; + while (false == finish_reading) { + // refill the buffer + m_buffer_length = ::read(m_fd, m_read_buffer, cReaderBufferSize); + if (m_buffer_length == -1) { + return ErrorCode_errno; + } + if (m_buffer_length < cReaderBufferSize) { + reached_eof = true; + } + if (m_buffer_length >= next_partial_read) { + memcpy(buf + num_bytes_read, m_read_buffer, next_partial_read); + m_buffer_pos = next_partial_read; + num_bytes_read += next_partial_read; + m_file_pos += num_bytes_read; + finish_reading = true; + } else { + // m_buffer_length < next_partial_read + memcpy(buf + num_bytes_read, m_read_buffer, m_buffer_length); + num_bytes_read += m_buffer_length; + m_file_pos += num_bytes_read; + next_partial_read -= m_buffer_length; + if (reached_eof) { + finish_reading = true; + } + } } } + return ErrorCode_Success; } // Maybe everytime, I should always read a page? ErrorCode FileReader::try_seek_from_begin (size_t pos) { - printf("try seek on fd %d\n", m_fd); if (m_fd == -1) { return ErrorCode_NotInit; } + struct stat st; + fstat(m_fd, &st); + off_t size = st.st_size; + if (pos >= size) { + return ErrorCode_EndOfFile; + } + if (pos > m_file_pos) { auto front_seek_amount = pos - m_file_pos; if (front_seek_amount > m_buffer_length - m_buffer_pos) { From 791cfeed9239f049d74a8c3099d4f3b1c956bcd4 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 7 May 2023 20:47:40 -0400 Subject: [PATCH 004/121] Allow seeking beyond end of file --- components/core/src/FileReader.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 10e5d7f80..efd51ce9e 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -106,12 +106,13 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { if (m_fd == -1) { return ErrorCode_NotInit; } - struct stat 
st; - fstat(m_fd, &st); - off_t size = st.st_size; - if (pos >= size) { - return ErrorCode_EndOfFile; - } + //TODO: do we need to detect out of range seek? +// struct stat st; +// fstat(m_fd, &st); +// off_t size = st.st_size; +// if (pos >= size) { +// return ErrorCode_EndOfFile; +// } if (pos > m_file_pos) { auto front_seek_amount = pos - m_file_pos; From d542e471b39cce058cfbb6fd3554ab9f050d8b85 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 7 May 2023 21:23:53 -0400 Subject: [PATCH 005/121] Use BufferReader for Decoder --- components/core/CMakeLists.txt | 2 + components/core/src/BufferReader.cpp | 67 +++++++++++ components/core/src/BufferReader.hpp | 56 +++++++++ .../src/ffi/ir_stream/decoding_methods.cpp | 108 +++++++----------- .../src/ffi/ir_stream/decoding_methods.hpp | 82 +------------ .../core/tests/test-ir_encoding_methods.cpp | 66 ++++++----- 6 files changed, 211 insertions(+), 170 deletions(-) create mode 100644 components/core/src/BufferReader.cpp create mode 100644 components/core/src/BufferReader.hpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 8d64bc07b..cce526d73 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -824,6 +824,8 @@ set(SOURCE_FILES_unitTest tests/test-string_utils.cpp tests/test-TimestampPattern.cpp tests/test-Utils.cpp + src/BufferReader.cpp + src/BufferReader.hpp ) add_executable(unitTest ${SOURCE_FILES_unitTest}) target_link_libraries(unitTest diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp new file mode 100644 index 000000000..329f08514 --- /dev/null +++ b/components/core/src/BufferReader.cpp @@ -0,0 +1,67 @@ +#include "BufferReader.hpp" + +// C++ standard libraries +#include + +// Project headers +#include "spdlog/spdlog.h" + +using std::string_view; + +[[nodiscard]] ErrorCode BufferReader::try_get_pos (size_t& pos) { + if (nullptr == m_buffer) { + return 
ErrorCode_NotInit; + } + pos = m_cursor_pos; + return ErrorCode_Success; +} + +[[nodiscard]] ErrorCode BufferReader::try_seek_from_begin (size_t pos) { + if (nullptr == m_buffer) { + return ErrorCode_NotInit; + } + // TODO: should we throw the error? + if (pos > m_size) { + return ErrorCode_OutOfBounds; + } + m_cursor_pos = pos; + return ErrorCode_Success; +} + +ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + + if (nullptr == m_buffer) { + return ErrorCode_NotInit; + } + if (nullptr == buf) { + return ErrorCode_BadParam; + } + + if (m_cursor_pos >= m_size) { + return ErrorCode_EndOfFile; + } + + ErrorCode error_code; + auto data_available = m_size - m_cursor_pos; + if (data_available < num_bytes_to_read) { + memcpy(buf, m_buffer + m_cursor_pos, data_available); + num_bytes_read = data_available; + error_code = ErrorCode_EndOfFile; + } else { + memcpy(buf, m_buffer + m_cursor_pos, num_bytes_to_read); + num_bytes_read = num_bytes_to_read; + error_code = ErrorCode_Success; + } + m_cursor_pos += num_bytes_read; + return error_code; +} + +bool BufferReader::try_read_string_view (string_view& str_view, size_t read_size) { + if ((m_cursor_pos + read_size) > m_size) { + return false; + } + str_view = string_view(reinterpret_cast(m_buffer + m_cursor_pos), + read_size); + m_cursor_pos += read_size; + return true; +} \ No newline at end of file diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp new file mode 100644 index 000000000..50d6d8222 --- /dev/null +++ b/components/core/src/BufferReader.hpp @@ -0,0 +1,56 @@ + +#ifndef BufferReader_HPP +#define BufferReader_HPP + +// C standard libraries + +// C++ standard libraries + +// Project headers +#include "ReaderInterface.hpp" + +class BufferReader : public ReaderInterface { +public: + BufferReader () : m_buffer(nullptr), + m_size(0), + m_cursor_pos(0), + m_checkpoint_pos(0) {} + BufferReader (const int8_t* data, size_t size) : + 
m_buffer(data), + m_size(size), + m_cursor_pos(0), + m_checkpoint_pos(0) {} + + [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, + size_t& num_bytes_read) override; + [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; + [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; + + void reset_buffer (const int8_t* data, size_t size) { + m_buffer = data; + m_size = size; + } + + // The following methods should only be used by the decoder + void mark_pos () { m_checkpoint_pos = m_cursor_pos; } + void revert_pos () { m_cursor_pos = m_checkpoint_pos; } + + /** + * Tries reading a string view of size = read_size from the ir_buf. + * @param str_view Returns the string view + * @param read_size + * @return true on success, false if the ir_buf doesn't contain enough + * data to decode + **/ + [[nodiscard]] bool try_read_string_view (std::string_view& str_view, size_t read_size); + +private: + + const int8_t* m_buffer; + size_t m_size; + size_t m_cursor_pos; + size_t m_checkpoint_pos; +}; + + +#endif // BufferReader_HPP diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 7ebafd58d..1a3742636 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -29,7 +29,7 @@ namespace ffi::ir_stream { * to decode */ template - static bool decode_int (IrBuffer& ir_buf, integer_t& value); + static bool decode_int (BufferReader& ir_buf, integer_t& value); /** * Decodes the next logtype string from ir_buf @@ -41,7 +41,7 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data * to decode */ - static IRErrorCode parse_logtype (IrBuffer& ir_buf, encoded_tag_t encoded_tag, + static IRErrorCode parse_logtype (BufferReader& ir_buf, encoded_tag_t encoded_tag, string_view& logtype); /** @@ -54,7 +54,7 @@ namespace ffi::ir_stream { * @return 
IRErrorCode_Incomplete_IR if input buffer doesn't contain enough * data to decode */ - static IRErrorCode parse_dictionary_var (IrBuffer& ir_buf, encoded_tag_t encoded_tag, + static IRErrorCode parse_dictionary_var (BufferReader& ir_buf, encoded_tag_t encoded_tag, string_view& dict_var); /** @@ -71,7 +71,7 @@ namespace ffi::ir_stream { * to decode */ template - IRErrorCode parse_timestamp (IrBuffer& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); + IRErrorCode parse_timestamp (BufferReader& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); /** * Decodes the next encoded message from ir_buf @@ -89,7 +89,7 @@ namespace ffi::ir_stream { * to decode */ template - static IRErrorCode generic_decode_next_message (IrBuffer& ir_buf, string& message, + static IRErrorCode generic_decode_next_message (BufferReader& ir_buf, string& message, epoch_time_ms_t& timestamp); /** @@ -103,7 +103,7 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data * to decode */ - static IRErrorCode read_metadata_info (IrBuffer& ir_buf, encoded_tag_t& metadata_type, + static IRErrorCode read_metadata_info (BufferReader& ir_buf, encoded_tag_t& metadata_type, uint16_t& metadata_size); /** @@ -125,30 +125,6 @@ namespace ffi::ir_stream { const vector& dictionary_vars ); - bool IrBuffer::try_read (string_view& str_view, size_t read_size) { - if (read_will_overflow(read_size)) { - return false; - } - str_view = string_view(reinterpret_cast(m_data + m_internal_cursor_pos), - read_size); - m_internal_cursor_pos += read_size; - return true; - } - - template - bool IrBuffer::try_read (integer_t& data) { - return try_read(&data, sizeof(data)); - } - - bool IrBuffer::try_read (void* dest, size_t read_size) { - if (read_will_overflow(read_size)) { - return false; - } - memcpy(dest, (m_data + m_internal_cursor_pos), read_size); - m_internal_cursor_pos += read_size; - return true; - } - template static bool is_variable_tag (encoded_tag_t tag, bool& 
is_encoded_var) { static_assert(is_same_v || @@ -176,9 +152,9 @@ namespace ffi::ir_stream { } template - static bool decode_int (IrBuffer& ir_buf, integer_t& value) { + static bool decode_int (BufferReader& ir_buf, integer_t& value) { integer_t value_small_endian; - if (ir_buf.try_read(value_small_endian) == false) { + if (ir_buf.try_read_numeric_value(value_small_endian) != ErrorCode_Success) { return false; } @@ -196,7 +172,7 @@ namespace ffi::ir_stream { return true; } - static IRErrorCode parse_logtype (IrBuffer& ir_buf, encoded_tag_t encoded_tag, + static IRErrorCode parse_logtype (BufferReader& ir_buf, encoded_tag_t encoded_tag, string_view& logtype) { size_t logtype_length; @@ -222,13 +198,13 @@ namespace ffi::ir_stream { return IRErrorCode_Corrupted_IR; } - if (ir_buf.try_read(logtype, logtype_length) == false) { + if (ir_buf.try_read_string_view(logtype, logtype_length) == false) { return IRErrorCode_Incomplete_IR; } return IRErrorCode_Success; } - static IRErrorCode parse_dictionary_var (IrBuffer& ir_buf, encoded_tag_t encoded_tag, + static IRErrorCode parse_dictionary_var (BufferReader& ir_buf, encoded_tag_t encoded_tag, string_view& dict_var) { // Decode variable's length size_t var_length; @@ -255,7 +231,7 @@ namespace ffi::ir_stream { } // Read the dictionary variable - if (false == ir_buf.try_read(dict_var, var_length)) { + if (false == ir_buf.try_read_string_view(dict_var, var_length)) { return IRErrorCode_Incomplete_IR; } @@ -263,7 +239,7 @@ namespace ffi::ir_stream { } template - IRErrorCode parse_timestamp (IrBuffer& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) + IRErrorCode parse_timestamp (BufferReader& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { static_assert(is_same_v || is_same_v); @@ -302,13 +278,11 @@ namespace ffi::ir_stream { } template - static IRErrorCode generic_decode_next_message (IrBuffer& ir_buf, string& message, + static IRErrorCode generic_decode_next_message (BufferReader& ir_buf, string& message, 
epoch_time_ms_t& timestamp) { - ir_buf.init_internal_pos(); - encoded_tag_t encoded_tag; - if (false == ir_buf.try_read(encoded_tag)) { + if (ErrorCode_Success != ir_buf.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } if (cProtocol::Eof == encoded_tag) { @@ -335,7 +309,7 @@ namespace ffi::ir_stream { } dict_vars.emplace_back(var_str); } - if (false == ir_buf.try_read(encoded_tag)) { + if (ErrorCode_Success != ir_buf.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } } @@ -351,7 +325,7 @@ namespace ffi::ir_stream { // NOTE: for the eight-byte encoding, the timestamp is the actual // timestamp; for the four-byte encoding, the timestamp is a timestamp // delta - if (false == ir_buf.try_read(encoded_tag)) { + if (ErrorCode_Success != ir_buf.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } if (auto error_code = parse_timestamp(ir_buf, encoded_tag, timestamp); @@ -364,20 +338,18 @@ namespace ffi::ir_stream { } catch (const EncodingException& e) { return IRErrorCode_Decode_Error; } - - ir_buf.commit_internal_pos(); return IRErrorCode_Success; } - static IRErrorCode read_metadata_info (IrBuffer& ir_buf, encoded_tag_t& metadata_type, + static IRErrorCode read_metadata_info (BufferReader& ir_buf, encoded_tag_t& metadata_type, uint16_t& metadata_size) { - if (false == ir_buf.try_read(metadata_type)) { + if (ErrorCode_Success != ir_buf.try_read_numeric_value(metadata_type)) { return IRErrorCode_Incomplete_IR; } // Read metadata length encoded_tag_t encoded_tag; - if (false == ir_buf.try_read(encoded_tag)) { + if (ErrorCode_Success != ir_buf.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } switch (encoded_tag) { @@ -490,11 +462,14 @@ namespace ffi::ir_stream { return message; } - IRErrorCode get_encoding_type (IrBuffer& ir_buf, bool& is_four_bytes_encoding) { - ir_buf.init_internal_pos(); + IRErrorCode get_encoding_type (BufferReader& ir_buf, bool& is_four_bytes_encoding) { + 
ir_buf.mark_pos(); - int8_t buffer[cProtocol::MagicNumberLength]; - if (false == ir_buf.try_read(buffer, cProtocol::MagicNumberLength)) { + char buffer[cProtocol::MagicNumberLength]; + size_t num_bytes_read; + if (ErrorCode_Success != ir_buf.try_read(buffer, cProtocol::MagicNumberLength, + num_bytes_read)) { + ir_buf.revert_pos(); return IRErrorCode_Incomplete_IR; } if (0 == memcmp(buffer, cProtocol::FourByteEncodingMagicNumber, @@ -504,35 +479,33 @@ namespace ffi::ir_stream { cProtocol::MagicNumberLength)) { is_four_bytes_encoding = false; } else { + ir_buf.revert_pos(); return IRErrorCode_Corrupted_IR; } - ir_buf.commit_internal_pos(); return IRErrorCode_Success; } - IRErrorCode decode_preamble (IrBuffer& ir_buf, encoded_tag_t& metadata_type, + IRErrorCode decode_preamble (BufferReader& ir_buf, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size) { - ir_buf.init_internal_pos(); + ir_buf.mark_pos(); if (auto error_code = read_metadata_info(ir_buf, metadata_type, metadata_size); error_code != IRErrorCode_Success) { + ir_buf.revert_pos(); return error_code; } - - size_t pos{ir_buf.get_cursor_pos()}; - ir_buf.commit_internal_pos(); - metadata_pos = ir_buf.get_cursor_pos(); - if (ir_buf.size() < metadata_pos + metadata_size) { - ir_buf.set_cursor_pos(pos); + metadata_pos = ir_buf.get_pos(); + //TODO: this might not be optimal + if (ErrorCode_Success != ir_buf.try_seek_from_begin(metadata_pos + metadata_size)) { + ir_buf.revert_pos(); return IRErrorCode_Incomplete_IR; } - ir_buf.set_cursor_pos(metadata_pos + metadata_size); return IRErrorCode_Success; } namespace four_byte_encoding { - IRErrorCode decode_next_message (IrBuffer& ir_buf, string& message, + IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, epoch_time_ms_t& timestamp_delta) { return generic_decode_next_message( @@ -542,12 +515,17 @@ namespace ffi::ir_stream { } namespace eight_byte_encoding { - IRErrorCode decode_next_message (IrBuffer& ir_buf, string& message, 
+ IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, epoch_time_ms_t& timestamp) { - return generic_decode_next_message( + ir_buf.mark_pos(); + auto error_code = generic_decode_next_message( ir_buf, message, timestamp ); + if (IRErrorCode_Success != error_code) { + ir_buf.revert_pos(); + } + return error_code; } } } diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 3060ca49d..5014320fa 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -7,86 +7,16 @@ // Project headers #include "../encoding_methods.hpp" - +#include "../../BufferReader.hpp" namespace ffi::ir_stream { - using encoded_tag_t = int8_t; - - /** - * Class representing an IR buffer that the decoder sequentially reads from. - * The class maintains an internal cursor such that every successful read - * increments the cursor. - */ - class IrBuffer { - public: - IrBuffer (const int8_t* data, size_t size) : - m_data(data), - m_size(size), - m_cursor_pos(0), - m_internal_cursor_pos(0) {} - - [[nodiscard]] size_t get_cursor_pos () const { return m_cursor_pos; } - void set_cursor_pos (size_t cursor_pos) { m_cursor_pos = cursor_pos; } - - // The following methods should only be used by the decoder - void init_internal_pos () { m_internal_cursor_pos = m_cursor_pos; } - void commit_internal_pos () { m_cursor_pos = m_internal_cursor_pos; } - size_t size () const { return m_size; } - - /** - * Tries reading a string view of size = read_size from the ir_buf. 
- * @param str_view Returns the string view - * @param read_size - * @return true on success, false if the ir_buf doesn't contain enough - * data to decode - **/ - [[nodiscard]] bool try_read (std::string_view& str_view, size_t read_size); - - /** - * Tries reading an integer of size = sizeof(integer_t) from the ir_buf - * @tparam integer_t - * @param data Returns the integer - * @return true on success, false if the ir_buf doesn't contain enough - * data to decode - */ - template - [[nodiscard]] bool try_read (integer_t& data); - - /** - * Tries reading data of size = read_size from the ir_buf. On success, - * stores the data into dest. - * @param dest - * @param read_size - * @return true on success, false if the ir_buf doesn't contain enough - * data to decode - */ - [[nodiscard]] bool try_read (void* dest, size_t read_size); - - private: - /** - * @param read_size - * @return Whether a read of the given size will exceed the size of the - * buffer - */ - [[nodiscard]] bool read_will_overflow (size_t read_size) const { - return (m_internal_cursor_pos + read_size) > m_size; - } - - const int8_t* const m_data; - const size_t m_size; - size_t m_cursor_pos; - // Internal cursor position to help restore cursor pos if/when decoding - // fails - size_t m_internal_cursor_pos; - }; + using encoded_tag_t = uint8_t; typedef enum { IRErrorCode_Success, IRErrorCode_Decode_Error, IRErrorCode_Eof, IRErrorCode_Corrupted_IR, - IRErrorCode_Corrupted_Metadata, IRErrorCode_Incomplete_IR, - IRErrorCode_Unsupported_Version, } IRErrorCode; /** @@ -98,7 +28,7 @@ namespace ffi::ir_stream { * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data to * decode */ - IRErrorCode get_encoding_type (IrBuffer& ir_buf, bool& is_four_bytes_encoding); + IRErrorCode get_encoding_type (BufferReader& ir_buf, bool& is_four_bytes_encoding); /** * Decodes the preamble for an IR stream. 
@@ -111,7 +41,7 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough * data to decode */ - IRErrorCode decode_preamble (IrBuffer& ir_buf, encoded_tag_t& metadata_type, + IRErrorCode decode_preamble (BufferReader& ir_buf, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size); namespace eight_byte_encoding { @@ -128,7 +58,7 @@ namespace ffi::ir_stream { * to decode * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode decode_next_message (IrBuffer& ir_buf, std::string& message, + IRErrorCode decode_next_message (BufferReader& ir_buf, std::string& message, epoch_time_ms_t& timestamp); } @@ -146,7 +76,7 @@ namespace ffi::ir_stream { * to decode * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode decode_next_message (IrBuffer& ir_buf, std::string& message, + IRErrorCode decode_next_message (BufferReader& ir_buf, std::string& message, epoch_time_ms_t& timestamp_delta); } } diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index f6e936963..c9885f0ba 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -9,6 +9,7 @@ #include "../src/ffi/ir_stream/encoding_methods.hpp" #include "../src/ffi/ir_stream/decoding_methods.hpp" #include "../src/ffi/ir_stream/protocol_constants.hpp" +#include "../src/BufferReader.hpp" using ffi::decode_float_var; using ffi::decode_integer_var; @@ -24,8 +25,9 @@ using ffi::ir_stream::cProtocol::EightByteEncodingMagicNumber; using ffi::ir_stream::cProtocol::FourByteEncodingMagicNumber; using ffi::ir_stream::cProtocol::MagicNumberLength; using ffi::ir_stream::get_encoding_type; -using ffi::ir_stream::IrBuffer; +using ffi::ir_stream::decode_preamble; using ffi::ir_stream::IRErrorCode; +using ffi::ir_stream::encoded_tag_t; using ffi::VariablePlaceholder; using ffi::wildcard_query_matches_any_encoded_var; using 
std::chrono::duration_cast; @@ -95,7 +97,7 @@ bool encode_message (epoch_time_ms_t timestamp, string_view message, string& log * encoded_variable_t == four_byte_encoded_variable_t */ template -IRErrorCode decode_next_message (IrBuffer& ir_buf, string& message, epoch_time_ms_t& decoded_ts); +IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, epoch_time_ms_t& decoded_ts); /** * Struct to hold the timestamp info from the IR stream's metadata @@ -182,7 +184,7 @@ bool encode_message (epoch_time_ms_t timestamp, string_view message, string& log } template -IRErrorCode decode_next_message (IrBuffer& ir_buf, string& message, epoch_time_ms_t& decoded_ts) { +IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, epoch_time_ms_t& decoded_ts) { static_assert(is_same_v || is_same_v); @@ -209,7 +211,7 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { vector eight_byte_encoding_vec{EightByteEncodingMagicNumber, EightByteEncodingMagicNumber + MagicNumberLength}; - IrBuffer eight_byte_ir_buffer(eight_byte_encoding_vec.data(), + BufferReader eight_byte_ir_buffer(eight_byte_encoding_vec.data(), eight_byte_encoding_vec.size()); REQUIRE(get_encoding_type(eight_byte_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); @@ -219,7 +221,7 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { vector four_byte_encoding_vec{FourByteEncodingMagicNumber, FourByteEncodingMagicNumber + MagicNumberLength}; - IrBuffer four_byte_ir_buffer(four_byte_encoding_vec.data(), + BufferReader four_byte_ir_buffer(four_byte_encoding_vec.data(), four_byte_encoding_vec.size()); REQUIRE(get_encoding_type(four_byte_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); @@ -227,17 +229,18 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { // Test error on empty and incomplete ir_buffer const vector empty_ir_vec; - IrBuffer empty_ir_buffer(empty_ir_vec.data(), empty_ir_vec.size()); + BufferReader 
empty_ir_buffer(empty_ir_vec.data(), empty_ir_vec.size()); REQUIRE(get_encoding_type(empty_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); - IrBuffer incomplete_ir_buffer(four_byte_encoding_vec.data(), four_byte_encoding_vec.size() - 1); + BufferReader incomplete_ir_buffer(four_byte_encoding_vec.data(), + four_byte_encoding_vec.size() - 1); REQUIRE(get_encoding_type(incomplete_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); // Test error on invalid encoding const vector invalid_ir_vec{0x02, 0x43, 0x24, 0x34}; - IrBuffer invalid_ir_buffer(invalid_ir_vec.data(), invalid_ir_vec.size()); + BufferReader invalid_ir_buffer(invalid_ir_vec.data(), invalid_ir_vec.size()); REQUIRE(get_encoding_type(invalid_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Corrupted_IR); @@ -256,24 +259,26 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode const size_t encoded_preamble_end_pos = ir_buf.size(); // Check if encoding type is properly read - IrBuffer preamble_buffer(ir_buf.data(), ir_buf.size()); + BufferReader preamble_buffer(ir_buf.data(), ir_buf.size()); bool is_four_bytes_encoding; REQUIRE(get_encoding_type(preamble_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); - REQUIRE(MagicNumberLength == preamble_buffer.get_cursor_pos()); + REQUIRE(MagicNumberLength == preamble_buffer.get_pos()); // Test if preamble can be decoded correctly TimestampInfo ts_info; - int8_t metadata_type{0}; + encoded_tag_t metadata_type{0}; size_t metadata_pos{0}; uint16_t metadata_size{0}; REQUIRE(decode_preamble(preamble_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Success); - REQUIRE(encoded_preamble_end_pos == preamble_buffer.get_cursor_pos()); + REQUIRE(encoded_preamble_end_pos == preamble_buffer.get_pos()); + + string_view json_metadata; + REQUIRE(ErrorCode_Success == 
preamble_buffer.try_seek_from_begin(metadata_pos)); + REQUIRE(preamble_buffer.try_read_string_view(json_metadata, metadata_size)); - char* metadata_ptr{size_checked_pointer_cast(ir_buf.data()) + metadata_pos}; - string_view json_metadata{metadata_ptr, metadata_size}; auto metadata_json = nlohmann::json::parse(json_metadata); REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey)); @@ -282,6 +287,8 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode REQUIRE(timestamp_pattern_syntax == ts_info.timestamp_pattern_syntax); REQUIRE(time_zone_id == ts_info.time_zone_id); REQUIRE(timestamp_pattern == ts_info.timestamp_pattern); + REQUIRE(encoded_preamble_end_pos == preamble_buffer.get_pos()); + if constexpr (is_same_v) { REQUIRE(reference_ts == std::stoll( @@ -291,15 +298,15 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode // Test if incomplete IR can be detected ir_buf.resize(encoded_preamble_end_pos - 1); - IrBuffer incomplete_preamble_buffer(ir_buf.data(), ir_buf.size()); - incomplete_preamble_buffer.set_cursor_pos(MagicNumberLength); + BufferReader incomplete_preamble_buffer(ir_buf.data(), ir_buf.size()); + incomplete_preamble_buffer.seek_from_begin(MagicNumberLength); REQUIRE(decode_preamble( incomplete_preamble_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Incomplete_IR); // Test if corrupted IR can be detected ir_buf[MagicNumberLength] = 0x23; - IrBuffer corrupted_preamble_buffer(ir_buf.data(), ir_buf.size()); + BufferReader corrupted_preamble_buffer(ir_buf.data(), ir_buf.size()); REQUIRE(decode_preamble( corrupted_preamble_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Corrupted_IR); @@ -319,7 +326,7 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", const size_t encoded_message_end_pos = ir_buf.size(); const size_t 
encoded_message_start_pos = 0; - IrBuffer encoded_message_buffer(ir_buf.data(), ir_buf.size()); + BufferReader encoded_message_buffer(ir_buf.data(), ir_buf.size()); string decoded_message; epoch_time_ms_t timestamp; @@ -327,14 +334,14 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", decode_next_message(encoded_message_buffer, decoded_message, timestamp)); REQUIRE(message == decoded_message); REQUIRE(timestamp == reference_timestamp); - REQUIRE(encoded_message_buffer.get_cursor_pos() == encoded_message_end_pos); + REQUIRE(encoded_message_buffer.get_pos() == encoded_message_end_pos); - encoded_message_buffer.set_cursor_pos(encoded_message_start_pos + 1); + encoded_message_buffer.seek_from_begin(encoded_message_start_pos + 1); REQUIRE(IRErrorCode::IRErrorCode_Corrupted_IR == decode_next_message(encoded_message_buffer, message, timestamp)); ir_buf.resize(encoded_message_end_pos - 4); - IrBuffer incomplete_message_buffer(ir_buf.data(), ir_buf.size()); + BufferReader incomplete_message_buffer(ir_buf.data(), ir_buf.size()); REQUIRE(IRErrorCode::IRErrorCode_Incomplete_IR == decode_next_message(incomplete_message_buffer, message, timestamp)); } @@ -366,7 +373,7 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") // Test if a trailing escape triggers a decoder error auto ir_with_extra_escape{ir_buf}; ir_with_extra_escape.at(logtype_end_pos - 1) = ffi::cVariablePlaceholderEscapeCharacter; - IrBuffer ir_buffer_with_extra_escape(ir_with_extra_escape.data(), ir_with_extra_escape.size()); + BufferReader ir_buffer_with_extra_escape(ir_with_extra_escape.data(), ir_with_extra_escape.size()); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == decode_next_message(ir_buffer_with_extra_escape, decoded_message, timestamp)); @@ -375,7 +382,7 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") auto ir_with_extra_placeholder{ir_buf}; ir_with_extra_placeholder.at(logtype_end_pos - 1) = 
enum_to_underlying_type(VariablePlaceholder::Dictionary); - IrBuffer ir_buffer_with_extra_placeholder(ir_with_extra_escape.data(), + BufferReader ir_buffer_with_extra_placeholder(ir_with_extra_escape.data(), ir_with_extra_escape.size()); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == decode_next_message(ir_buffer_with_extra_placeholder, @@ -392,7 +399,7 @@ TEST_CASE("decode_next_message_four_byte_negative_delta", "[ffi][decode_next_mes REQUIRE(true == encode_message(reference_delta_ts_negative, message, logtype, ir_buf)); - IrBuffer encoded_message_buffer(ir_buf.data(), ir_buf.size()); + BufferReader encoded_message_buffer(ir_buf.data(), ir_buf.size()); string decoded_message; epoch_time_ms_t delta_ts; REQUIRE(IRErrorCode::IRErrorCode_Success == @@ -435,7 +442,7 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", reference_messages.push_back(message); reference_timestamps.push_back(ts); - IrBuffer complete_encoding_buffer(ir_buf.data(), ir_buf.size()); + BufferReader complete_encoding_buffer(ir_buf.data(), ir_buf.size()); bool is_four_bytes_encoding; REQUIRE(get_encoding_type(complete_encoding_buffer, is_four_bytes_encoding) == @@ -444,15 +451,16 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", // Test if preamble can be properly decoded TimestampInfo ts_info; - int8_t metadata_type; + encoded_tag_t metadata_type; size_t metadata_pos; uint16_t metadata_size; REQUIRE(decode_preamble(complete_encoding_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Success); - REQUIRE(encoded_preamble_end_pos == complete_encoding_buffer.get_cursor_pos()); + REQUIRE(encoded_preamble_end_pos == complete_encoding_buffer.get_pos()); string_view json_metadata; - REQUIRE(complete_encoding_buffer.try_read(json_metadata, metadata_size)); + REQUIRE(ErrorCode_Success == complete_encoding_buffer.try_seek_from_begin(metadata_pos)); + REQUIRE(complete_encoding_buffer.try_read_string_view(json_metadata, 
metadata_size)); auto metadata_json = nlohmann::json::parse(json_metadata); REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey)); @@ -471,5 +479,5 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", REQUIRE(decoded_message == reference_messages[ix]); REQUIRE(timestamp == reference_timestamps[ix]); } - REQUIRE(complete_encoding_buffer.get_cursor_pos() == ir_buf.size()); + REQUIRE(complete_encoding_buffer.get_pos() == ir_buf.size()); } From 3cb8f7c489388dbb3544d862be88191e840818d7 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 7 May 2023 21:58:01 -0400 Subject: [PATCH 006/121] small clean up --- components/core/src/BufferReader.cpp | 18 ++++++++++++++++++ components/core/src/BufferReader.hpp | 24 ++++++++++++++++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 329f08514..c153f1d1a 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -56,6 +56,24 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n return error_code; } +void BufferReader::mark_pos () { + checkpoint_enable = true; + m_checkpoint_pos = m_cursor_pos; +} + +void BufferReader::revert_pos () { + if (false == checkpoint_enable) { + SPDLOG_ERROR("DictionaryWriter ran out of IDs."); + throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); + } + m_cursor_pos = m_checkpoint_pos; +} + +void BufferReader::reset_checkpoint () { + m_checkpoint_pos = 0; + checkpoint_enable = true; +} + bool BufferReader::try_read_string_view (string_view& str_view, size_t read_size) { if ((m_cursor_pos + read_size) > m_size) { return false; diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 50d6d8222..edb9de047 100644 --- a/components/core/src/BufferReader.hpp 
+++ b/components/core/src/BufferReader.hpp @@ -11,15 +11,29 @@ class BufferReader : public ReaderInterface { public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : + TraceableException (error_code, filename, line_number) {} + + // Methods + const char* what () const noexcept override { + return "BufferReader operation failed"; + } + }; BufferReader () : m_buffer(nullptr), m_size(0), m_cursor_pos(0), - m_checkpoint_pos(0) {} + m_checkpoint_pos(0), + checkpoint_enable(false) {} BufferReader (const int8_t* data, size_t size) : m_buffer(data), m_size(size), m_cursor_pos(0), - m_checkpoint_pos(0) {} + m_checkpoint_pos(0), + checkpoint_enable(false) {} [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; @@ -32,8 +46,9 @@ class BufferReader : public ReaderInterface { } // The following methods should only be used by the decoder - void mark_pos () { m_checkpoint_pos = m_cursor_pos; } - void revert_pos () { m_cursor_pos = m_checkpoint_pos; } + virtual void mark_pos (); + virtual void revert_pos (); + virtual void reset_checkpoint (); /** * Tries reading a string view of size = read_size from the ir_buf. 
@@ -47,6 +62,7 @@ class BufferReader : public ReaderInterface { private: const int8_t* m_buffer; + bool checkpoint_enable; size_t m_size; size_t m_cursor_pos; size_t m_checkpoint_pos; From 21a37a0571771959ee4e931c65ed94c3a5d06e67 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 8 May 2023 00:44:18 -0400 Subject: [PATCH 007/121] Fixes but yet to verify --- components/core/src/FileReader.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index efd51ce9e..4d2b4a6d5 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -59,10 +59,10 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num size_t next_partial_read = num_bytes_to_read - remaining_data; memcpy(buf, m_read_buffer + m_buffer_pos, remaining_data); num_bytes_read = remaining_data; + m_file_pos += remaining_data; + m_buffer_pos = m_buffer_length; if (reached_eof) { - m_file_pos += remaining_data; - m_buffer_pos += m_buffer_length; num_bytes_read = remaining_data; if (num_bytes_read == 0) { return ErrorCode_EndOfFile; @@ -84,13 +84,14 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num memcpy(buf + num_bytes_read, m_read_buffer, next_partial_read); m_buffer_pos = next_partial_read; num_bytes_read += next_partial_read; - m_file_pos += num_bytes_read; + m_file_pos += next_partial_read; finish_reading = true; } else { // m_buffer_length < next_partial_read memcpy(buf + num_bytes_read, m_read_buffer, m_buffer_length); num_bytes_read += m_buffer_length; - m_file_pos += num_bytes_read; + m_file_pos += m_buffer_length; + m_buffer_pos = m_buffer_length; next_partial_read -= m_buffer_length; if (reached_eof) { finish_reading = true; @@ -201,13 +202,14 @@ ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bo // read out a new buffer std::string_view 
substr {m_read_buffer + m_buffer_pos, m_buffer_length - m_buffer_pos}; str.append(substr); + m_file_pos += m_buffer_length - m_buffer_pos; // refill the buffer if (reached_eof) { + m_buffer_pos = m_buffer_length; return ErrorCode_EndOfFile; } - m_file_pos += m_buffer_length - m_buffer_pos; - m_buffer_pos = 0; m_buffer_length = ::read(m_fd, m_read_buffer, cReaderBufferSize); + m_buffer_pos = 0; if (m_buffer_length < cReaderBufferSize) { reached_eof = true; } From 827c76217c31aa85ec4695437689ab89ac603b66 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 8 May 2023 01:57:42 -0400 Subject: [PATCH 008/121] small clean up --- components/core/CMakeLists.txt | 2 + components/core/src/BufferReader.hpp | 2 +- components/core/src/FileReader.cpp | 62 +++++++++++++++------------- components/core/src/FileReader.hpp | 27 +++++------- 4 files changed, 48 insertions(+), 45 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index cce526d73..4a35c52f9 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -159,6 +159,8 @@ FindDynamicLibraryDependencies(sqlite "${sqlite_DYNAMIC_LIBS}") set(SOURCE_FILES_clp src/ArrayBackedPosIntSet.cpp src/ArrayBackedPosIntSet.hpp + src/BufferReader.cpp + src/BufferReader.hpp src/clp/clp.cpp src/clp/CommandLineArguments.cpp src/clp/CommandLineArguments.hpp diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index edb9de047..2928dbb9f 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -59,7 +59,7 @@ class BufferReader : public ReaderInterface { **/ [[nodiscard]] bool try_read_string_view (std::string_view& str_view, size_t read_size); -private: +protected: const int8_t* m_buffer; bool checkpoint_enable; diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 4d2b4a6d5..1b245c13a 100644 --- a/components/core/src/FileReader.cpp 
+++ b/components/core/src/FileReader.cpp @@ -30,6 +30,18 @@ ErrorCode FileReader::try_get_pos (size_t& pos) { return ErrorCode_Success; } +ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { + // refill the local buffer + m_buffer_length = ::read(m_fd, m_read_buffer, cReaderBufferSize); + if (m_buffer_length < num_bytes_to_read) { + reached_eof = true; + } + if (m_buffer_length == -1) { + return ErrorCode_errno; + } + return ErrorCode_Success; +} + ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { if (-1 == m_fd) { return ErrorCode_NotInit; @@ -43,7 +55,6 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num if (offset != m_file_pos) { return ErrorCode_errno; } - } size_t remaining_data = m_buffer_length - m_buffer_pos; @@ -73,12 +84,9 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num bool finish_reading = false; while (false == finish_reading) { // refill the buffer - m_buffer_length = ::read(m_fd, m_read_buffer, cReaderBufferSize); - if (m_buffer_length == -1) { - return ErrorCode_errno; - } - if (m_buffer_length < cReaderBufferSize) { - reached_eof = true; + if (auto error_code = refill_reader_buffer(cReaderBufferSize); + ErrorCode_Success != error_code) { + return error_code; } if (m_buffer_length >= next_partial_read) { memcpy(buf + num_bytes_read, m_read_buffer, next_partial_read); @@ -161,14 +169,14 @@ ErrorCode FileReader::try_open (const string& path) { return ErrorCode_errno; } m_path = path; - m_buffer_pos = 0; m_file_pos = 0; - // If I open here, later I may get eof error, so I can not open here - // so early read might not be a good idea - m_buffer_length = 0; reached_eof = false; started_reading = false; + // Buffer specific things + m_buffer_pos = 0; + m_buffer_length = 0; + reset_buffer(m_read_buffer, m_buffer_length); return ErrorCode_Success; } @@ -192,7 +200,8 @@ ErrorCode FileReader::try_read_to_delimiter (char delim, 
bool keep_delimiter, bo } if (found_delim) { // append to strings - std::string_view substr {m_read_buffer + m_buffer_pos, cursor + 1 - m_buffer_pos}; + std::string_view substr(reinterpret_cast(m_read_buffer + m_buffer_pos), + cursor + 1 - m_buffer_pos); str.append(substr); // increase file pos m_file_pos += (cursor + 1) - m_buffer_pos; @@ -200,25 +209,25 @@ ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bo } else { // if we didn't find a delimiter, we append the current buffer to the str and // read out a new buffer - std::string_view substr {m_read_buffer + m_buffer_pos, m_buffer_length - m_buffer_pos}; + auto remaining_data_size = m_buffer_length - m_buffer_pos; + std::string_view substr(reinterpret_cast(m_read_buffer + m_buffer_pos), + remaining_data_size); str.append(substr); - m_file_pos += m_buffer_length - m_buffer_pos; + m_file_pos += remaining_data_size; // refill the buffer if (reached_eof) { m_buffer_pos = m_buffer_length; return ErrorCode_EndOfFile; } - m_buffer_length = ::read(m_fd, m_read_buffer, cReaderBufferSize); - m_buffer_pos = 0; - if (m_buffer_length < cReaderBufferSize) { - reached_eof = true; - } - if (m_buffer_length == -1) { - return ErrorCode_errno; - } - if (m_buffer_length == 0) { - return ErrorCode_EndOfFile; + // this place is little weird. need to think carefully. 
+ // if the buffer_length = 0, then reach_eof = true, after that, we'll + // enter the next loop and then exit + // refill the buffer + if (auto error_code = refill_reader_buffer(cReaderBufferSize); + ErrorCode_Success != error_code) { + return error_code; } + m_buffer_pos = 0; } } return ErrorCode_Success; @@ -239,10 +248,7 @@ void FileReader::close () { if (-1 != m_fd) { // NOTE: We don't check errors for fclose since it seems the only reason it could fail is if it was interrupted // by a signal - auto res = ::close(m_fd); - if (0 != res) { - throw "Not sure why close fail\n"; - } + ::close(m_fd); m_fd = -1; } } diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 801e8c3d8..38c3afe12 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -11,11 +11,11 @@ // Project headers #include "Defs.h" #include "ErrorCode.hpp" -#include "ReaderInterface.hpp" +#include "BufferReader.hpp" #include "TraceableException.hpp" -class FileReader : public ReaderInterface { +class FileReader : public BufferReader { public: // Types class OperationFailed : public TraceableException { @@ -31,7 +31,7 @@ class FileReader : public ReaderInterface { // Constructors FileReader() : m_file_pos(0), m_buffer_pos(0), m_fd(-1) { - m_read_buffer = (char*)malloc(sizeof(char) * cReaderBufferSize); + m_read_buffer = reinterpret_cast(malloc(sizeof(int8_t) * cReaderBufferSize)); } ~FileReader(); // Methods implementing the ReaderInterface @@ -111,26 +111,21 @@ class FileReader : public ReaderInterface { private: + + ErrorCode refill_reader_buffer(size_t num_bytes_to_read); + // Types size_t m_file_pos; - ssize_t m_buffer_length; - size_t m_buffer_pos; - char* m_read_buffer; int m_fd; std::string m_path; bool reached_eof; bool started_reading; - static constexpr size_t cReaderBufferSize = 1024; - // Constants - - // Factory functions - - // Assignment operators - - // Methods - - // Variables + // Buffer specific data + ssize_t 
m_buffer_length; + size_t m_buffer_pos; + int8_t* m_read_buffer; + static constexpr size_t cReaderBufferSize = 1024; }; From 1971ae41b32da866f871f66fee47e286284f29eb Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 8 May 2023 16:59:45 -0400 Subject: [PATCH 009/121] Some refactoring and decompression looks ok --- components/core/src/BufferReader.hpp | 1 + components/core/src/FileReader.cpp | 71 +++++++++++++++------------- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 2928dbb9f..fb08a6a54 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -43,6 +43,7 @@ class BufferReader : public ReaderInterface { void reset_buffer (const int8_t* data, size_t size) { m_buffer = data; m_size = size; + m_cursor_pos = 0; } // The following methods should only be used by the decoder diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 1b245c13a..49cf34d17 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -39,6 +39,8 @@ ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { if (m_buffer_length == -1) { return ErrorCode_errno; } + reset_buffer(m_read_buffer, m_buffer_length); + return ErrorCode_Success; } @@ -52,62 +54,66 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num if (started_reading == false) { started_reading = true; auto offset = lseek(m_fd, m_file_pos, SEEK_SET); - if (offset != m_file_pos) { + if (offset == -1) { return ErrorCode_errno; } + auto error_code = refill_reader_buffer(cReaderBufferSize); + if (ErrorCode_Success != error_code) { + return error_code; + } } - size_t remaining_data = m_buffer_length - m_buffer_pos; - if (num_bytes_to_read <= remaining_data) { - memcpy(buf, m_read_buffer + m_buffer_pos, num_bytes_to_read); - // increment cursors - 
m_buffer_pos += num_bytes_to_read; - m_file_pos += num_bytes_to_read; - num_bytes_read = num_bytes_to_read; - } else { - // else if data is not enough. - // first, read everything from buffer - size_t next_partial_read = num_bytes_to_read - remaining_data; - memcpy(buf, m_read_buffer + m_buffer_pos, remaining_data); - num_bytes_read = remaining_data; - m_file_pos += remaining_data; - m_buffer_pos = m_buffer_length; + num_bytes_read = 0; + size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; + size_t num_bytes_read_from_buffer; + auto error_code = BufferReader::try_read(buf + num_bytes_read, num_bytes_to_read_from_buffer, + num_bytes_read_from_buffer); + if (ErrorCode_Success == error_code) { + // if success, means the buffer still has enough data to read from + m_file_pos += num_bytes_read_from_buffer; + num_bytes_read = num_bytes_read_from_buffer; + return ErrorCode_Success; + } else if (ErrorCode_EndOfFile == error_code) { + // else, data is not enough. but anyway we have already readed some + m_file_pos += num_bytes_read_from_buffer; + num_bytes_read += num_bytes_read_from_buffer; + num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; + // if we know the file has been exhausted. 
if (reached_eof) { - num_bytes_read = remaining_data; if (num_bytes_read == 0) { return ErrorCode_EndOfFile; } return ErrorCode_Success; } - + // else, we should refill the buffer and keep reading bool finish_reading = false; while (false == finish_reading) { // refill the buffer - if (auto error_code = refill_reader_buffer(cReaderBufferSize); - ErrorCode_Success != error_code) { + error_code = refill_reader_buffer(cReaderBufferSize); + if (ErrorCode_Success != error_code) { return error_code; } - if (m_buffer_length >= next_partial_read) { - memcpy(buf + num_bytes_read, m_read_buffer, next_partial_read); - m_buffer_pos = next_partial_read; - num_bytes_read += next_partial_read; - m_file_pos += next_partial_read; + // now try to read from the buffer + error_code = BufferReader::try_read(buf + num_bytes_read, + num_bytes_to_read_from_buffer, + num_bytes_read_from_buffer); + if (ErrorCode_Success == error_code) { finish_reading = true; } else { - // m_buffer_length < next_partial_read - memcpy(buf + num_bytes_read, m_read_buffer, m_buffer_length); - num_bytes_read += m_buffer_length; - m_file_pos += m_buffer_length; - m_buffer_pos = m_buffer_length; - next_partial_read -= m_buffer_length; if (reached_eof) { finish_reading = true; } } + num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; + m_file_pos += num_bytes_read_from_buffer; + num_bytes_read += num_bytes_read_from_buffer; } + return ErrorCode_Success; + } else { + // else some unexpected error code is encountered. + throw OperationFailed(error_code, __FILENAME__, __LINE__); } - return ErrorCode_Success; } // Maybe everytime, I should always read a page? 
@@ -176,7 +182,6 @@ ErrorCode FileReader::try_open (const string& path) { // Buffer specific things m_buffer_pos = 0; m_buffer_length = 0; - reset_buffer(m_read_buffer, m_buffer_length); return ErrorCode_Success; } From 1764cfbc70925cf71fc4a8714630fc0f0d6ebeef Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 8 May 2023 17:16:11 -0400 Subject: [PATCH 010/121] ok for compression --- components/core/src/FileReader.cpp | 34 +++++++++++------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 49cf34d17..57e2c7d2c 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -195,33 +195,23 @@ ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bo bool found_delim {false}; while (false == found_delim) { - auto cursor {m_buffer_pos}; - while (cursor < m_buffer_length) { - if (delim == m_read_buffer[cursor]) { + auto cursor {m_cursor_pos}; + while (cursor < m_size && false == found_delim) { + if (delim == m_buffer[cursor]) { found_delim = true; - break; } cursor++; } - if (found_delim) { - // append to strings - std::string_view substr(reinterpret_cast(m_read_buffer + m_buffer_pos), - cursor + 1 - m_buffer_pos); - str.append(substr); - // increase file pos - m_file_pos += (cursor + 1) - m_buffer_pos; - m_buffer_pos = cursor + 1; - } else { - // if we didn't find a delimiter, we append the current buffer to the str and - // read out a new buffer - auto remaining_data_size = m_buffer_length - m_buffer_pos; - std::string_view substr(reinterpret_cast(m_read_buffer + m_buffer_pos), - remaining_data_size); - str.append(substr); - m_file_pos += remaining_data_size; + // append to strings + std::string_view substr(reinterpret_cast(m_buffer + m_cursor_pos), + cursor - m_cursor_pos); + str.append(substr); + // increase file pos + m_file_pos += cursor - m_cursor_pos; + m_cursor_pos = 
cursor; + if (false == found_delim) { // refill the buffer if (reached_eof) { - m_buffer_pos = m_buffer_length; return ErrorCode_EndOfFile; } // this place is little weird. need to think carefully. @@ -232,7 +222,7 @@ ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bo ErrorCode_Success != error_code) { return error_code; } - m_buffer_pos = 0; + m_cursor_pos = 0; } } return ErrorCode_Success; From 53190678750951fbd0b9ebbcbc168e951b336d76 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 8 May 2023 21:01:27 -0400 Subject: [PATCH 011/121] bug fix for proper compression --- components/core/src/BufferReader.cpp | 1 - components/core/src/FileReader.cpp | 97 ++++++++++++---------------- components/core/src/FileReader.hpp | 2 +- 3 files changed, 42 insertions(+), 58 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index c153f1d1a..57d4ca8fd 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -29,7 +29,6 @@ using std::string_view; } ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - if (nullptr == m_buffer) { return ErrorCode_NotInit; } diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 57e2c7d2c..9ad58b57d 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -31,15 +31,15 @@ ErrorCode FileReader::try_get_pos (size_t& pos) { } ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { - // refill the local buffer - m_buffer_length = ::read(m_fd, m_read_buffer, cReaderBufferSize); - if (m_buffer_length < num_bytes_to_read) { + size_t num_bytes_read; + num_bytes_read = ::read(m_fd, m_read_buffer, cReaderBufferSize); + if (num_bytes_read < num_bytes_to_read) { reached_eof = true; } - if (m_buffer_length == -1) { + if (num_bytes_read == -1) { return ErrorCode_errno; } - 
reset_buffer(m_read_buffer, m_buffer_length); + reset_buffer(m_read_buffer, num_bytes_read); return ErrorCode_Success; } @@ -51,69 +51,59 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num if (nullptr == buf) { return ErrorCode_BadParam; } + // Question: should we delay the fseek? if (started_reading == false) { started_reading = true; auto offset = lseek(m_fd, m_file_pos, SEEK_SET); if (offset == -1) { return ErrorCode_errno; } - auto error_code = refill_reader_buffer(cReaderBufferSize); - if (ErrorCode_Success != error_code) { - return error_code; - } } num_bytes_read = 0; size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; size_t num_bytes_read_from_buffer; - auto error_code = BufferReader::try_read(buf + num_bytes_read, num_bytes_to_read_from_buffer, - num_bytes_read_from_buffer); - if (ErrorCode_Success == error_code) { - // if success, means the buffer still has enough data to read from - m_file_pos += num_bytes_read_from_buffer; - num_bytes_read = num_bytes_read_from_buffer; - return ErrorCode_Success; - } else if (ErrorCode_EndOfFile == error_code) { - // else, data is not enough. but anyway we have already readed some - m_file_pos += num_bytes_read_from_buffer; - num_bytes_read += num_bytes_read_from_buffer; - num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; - // if we know the file has been exhausted. 
- if (reached_eof) { - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } - return ErrorCode_Success; - } - // else, we should refill the buffer and keep reading - bool finish_reading = false; - while (false == finish_reading) { - // refill the buffer + // keep reading + bool finish_reading = false; + while (false == finish_reading) { + auto error_code = BufferReader::try_read(buf + num_bytes_read, + num_bytes_to_read_from_buffer, + num_bytes_read_from_buffer); + if (ErrorCode_NotInit == error_code) { + // else, we refill the buffer error_code = refill_reader_buffer(cReaderBufferSize); if (ErrorCode_Success != error_code) { return error_code; } - // now try to read from the buffer - error_code = BufferReader::try_read(buf + num_bytes_read, - num_bytes_to_read_from_buffer, - num_bytes_read_from_buffer); + } else { + m_file_pos += num_bytes_read_from_buffer; + num_bytes_read += num_bytes_read_from_buffer; + num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; if (ErrorCode_Success == error_code) { + // if success, means the buffer still has enough data to read from finish_reading = true; - } else { + } else if (ErrorCode_EndOfFile == error_code) { + // if the buffer is not loaded or has been exhausted. + // simply return if (reached_eof) { - finish_reading = true; + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; } + // else, we refill the buffer + error_code = refill_reader_buffer(cReaderBufferSize); + if (ErrorCode_Success != error_code) { + return error_code; + } + } else { + // else some unexpected error code is encountered. + throw OperationFailed(error_code, __FILENAME__, __LINE__); } - num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; - m_file_pos += num_bytes_read_from_buffer; - num_bytes_read += num_bytes_read_from_buffer; } - return ErrorCode_Success; - } else { - // else some unexpected error code is encountered. 
- throw OperationFailed(error_code, __FILENAME__, __LINE__); } + return ErrorCode_Success; } // Maybe everytime, I should always read a page? @@ -129,13 +119,15 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { // return ErrorCode_EndOfFile; // } + // if we are at A, readed something, seek to B which is on another buffer place. + // and seek back to A, how will this be handled if (pos > m_file_pos) { auto front_seek_amount = pos - m_file_pos; - if (front_seek_amount > m_buffer_length - m_buffer_pos) { + if (front_seek_amount > m_size - m_cursor_pos) { // if the seek-to pos is out of buffer printf("Seek front on %d\n", m_fd); - m_buffer_length = 0; - m_buffer_pos = 0; + m_size = 0; + m_cursor_pos = 0; m_file_pos = pos; } else { // otherwise, we can simply @@ -180,8 +172,7 @@ ErrorCode FileReader::try_open (const string& path) { started_reading = false; // Buffer specific things - m_buffer_pos = 0; - m_buffer_length = 0; + reset_buffer(nullptr, 0); return ErrorCode_Success; } @@ -210,19 +201,13 @@ ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bo m_file_pos += cursor - m_cursor_pos; m_cursor_pos = cursor; if (false == found_delim) { - // refill the buffer if (reached_eof) { return ErrorCode_EndOfFile; } - // this place is little weird. need to think carefully. 
- // if the buffer_length = 0, then reach_eof = true, after that, we'll - // enter the next loop and then exit - // refill the buffer if (auto error_code = refill_reader_buffer(cReaderBufferSize); ErrorCode_Success != error_code) { return error_code; } - m_cursor_pos = 0; } } return ErrorCode_Success; diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 38c3afe12..6203fbc53 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -125,7 +125,7 @@ class FileReader : public BufferReader { ssize_t m_buffer_length; size_t m_buffer_pos; int8_t* m_read_buffer; - static constexpr size_t cReaderBufferSize = 1024; + static constexpr size_t cReaderBufferSize = 4096; }; From f616add9096d8f180bbb0abac19d12c87d62baa7 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 8 May 2023 23:35:23 -0400 Subject: [PATCH 012/121] make error code interface matches with what's expected --- components/core/src/BufferReader.cpp | 15 ++----- components/core/src/FileReader.cpp | 43 +++++++++++-------- components/core/src/FileReader.hpp | 9 ++-- .../src/ffi/ir_stream/decoding_methods.cpp | 4 +- 4 files changed, 33 insertions(+), 38 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 57d4ca8fd..dc3217e8e 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -40,19 +40,10 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n return ErrorCode_EndOfFile; } - ErrorCode error_code; - auto data_available = m_size - m_cursor_pos; - if (data_available < num_bytes_to_read) { - memcpy(buf, m_buffer + m_cursor_pos, data_available); - num_bytes_read = data_available; - error_code = ErrorCode_EndOfFile; - } else { - memcpy(buf, m_buffer + m_cursor_pos, num_bytes_to_read); - num_bytes_read = num_bytes_to_read; - error_code = ErrorCode_Success; - } + num_bytes_read = 
std::min(m_size - m_cursor_pos, num_bytes_to_read); + memcpy(buf, m_buffer + m_cursor_pos, num_bytes_read); m_cursor_pos += num_bytes_read; - return error_code; + return ErrorCode_Success; } void BufferReader::mark_pos () { diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 9ad58b57d..94aa46288 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -76,31 +76,38 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num if (ErrorCode_Success != error_code) { return error_code; } - } else { + } else if (ErrorCode_Success == error_code) { m_file_pos += num_bytes_read_from_buffer; num_bytes_read += num_bytes_read_from_buffer; num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; - if (ErrorCode_Success == error_code) { - // if success, means the buffer still has enough data to read from + if (num_bytes_to_read_from_buffer == 0) { finish_reading = true; - } else if (ErrorCode_EndOfFile == error_code) { - // if the buffer is not loaded or has been exhausted. - // simply return - if (reached_eof) { - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } - return ErrorCode_Success; - } + } else { // else, we refill the buffer error_code = refill_reader_buffer(cReaderBufferSize); if (ErrorCode_Success != error_code) { return error_code; } - } else { - // else some unexpected error code is encountered. - throw OperationFailed(error_code, __FILENAME__, __LINE__); } + } else if (ErrorCode_EndOfFile == error_code) { + // if we encounter the end of file, means the buffer + // happens to be drained out and we didn't read any data from it + // in this case, exit if reached_eof, or simply refill the buffer. 
+ if (reached_eof) { + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } else { + return ErrorCode_Success; + } + } + // else, we refill the buffer + error_code = refill_reader_buffer(cReaderBufferSize); + if (ErrorCode_Success != error_code) { + return error_code; + } + } else { + // else some unexpected error code is encountered. + throw OperationFailed(error_code, __FILENAME__, __LINE__); } } return ErrorCode_Success; @@ -132,7 +139,7 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { } else { // otherwise, we can simply printf("simple seek front on %d\n", m_fd); - m_buffer_pos += front_seek_amount; + m_cursor_pos += front_seek_amount; m_file_pos = pos; } } else { @@ -142,11 +149,11 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { m_file_pos = pos; } else { auto seek_back_amount = m_file_pos - pos; - if (seek_back_amount > m_buffer_pos) { + if (seek_back_amount > m_cursor_pos) { SPDLOG_ERROR("Can't back trace anymore"); throw; } else { - m_buffer_pos = m_buffer_pos - seek_back_amount; + m_cursor_pos = m_cursor_pos - seek_back_amount; m_file_pos = pos; } } diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 6203fbc53..5407be453 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -30,7 +30,7 @@ class FileReader : public BufferReader { }; // Constructors - FileReader() : m_file_pos(0), m_buffer_pos(0), m_fd(-1) { + FileReader() : m_file_pos(0), m_fd(-1) { m_read_buffer = reinterpret_cast(malloc(sizeof(int8_t) * cReaderBufferSize)); } ~FileReader(); @@ -107,8 +107,7 @@ class FileReader : public BufferReader { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - ErrorCode try_fstat (struct stat& stat_buffer); - + [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer); private: @@ -122,10 +121,8 @@ class FileReader : public BufferReader { bool started_reading; // Buffer specific data - ssize_t m_buffer_length; - size_t 
m_buffer_pos; int8_t* m_read_buffer; - static constexpr size_t cReaderBufferSize = 4096; + static constexpr size_t cReaderBufferSize = 65536; }; diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 1a3742636..5680cb5f8 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -467,8 +467,8 @@ namespace ffi::ir_stream { char buffer[cProtocol::MagicNumberLength]; size_t num_bytes_read; - if (ErrorCode_Success != ir_buf.try_read(buffer, cProtocol::MagicNumberLength, - num_bytes_read)) { + auto error_code = ir_buf.try_read(buffer, cProtocol::MagicNumberLength, num_bytes_read); + if (error_code != ErrorCode_Success || num_bytes_read != cProtocol::MagicNumberLength) { ir_buf.revert_pos(); return IRErrorCode_Incomplete_IR; } From 8d2fce04757a93d482046f92ff4e0c1886f7c2e4 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 9 May 2023 23:51:10 -0400 Subject: [PATCH 013/121] Support checkpoint in new file reader --- components/core/CMakeLists.txt | 4 + components/core/src/BufferReader.cpp | 2 +- components/core/src/BufferReader.hpp | 4 +- components/core/src/FileReader.cpp | 172 +++++++++++++++++++-------- components/core/src/FileReader.hpp | 24 ++-- 5 files changed, 149 insertions(+), 57 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 4a35c52f9..4f956a018 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -339,6 +339,8 @@ target_compile_features(clp ) set(SOURCE_FILES_clg + src/BufferReader.cpp + src/BufferReader.hpp src/clg/clg.cpp src/clg/CommandLineArguments.cpp src/clg/CommandLineArguments.hpp @@ -486,6 +488,8 @@ target_compile_features(clg ) set(SOURCE_FILES_clo + src/BufferReader.cpp + src/BufferReader.hpp src/clo/clo.cpp src/clo/CommandLineArguments.cpp src/clo/CommandLineArguments.hpp diff --git 
a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index dc3217e8e..70bf4a76b 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -61,7 +61,7 @@ void BufferReader::revert_pos () { void BufferReader::reset_checkpoint () { m_checkpoint_pos = 0; - checkpoint_enable = true; + checkpoint_enable = false; } bool BufferReader::try_read_string_view (string_view& str_view, size_t read_size) { diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index fb08a6a54..17a1b1e74 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -63,9 +63,11 @@ class BufferReader : public ReaderInterface { protected: const int8_t* m_buffer; - bool checkpoint_enable; size_t m_size; size_t m_cursor_pos; + +private: + bool checkpoint_enable; size_t m_checkpoint_pos; }; diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 94aa46288..b2a30f6bf 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -32,15 +32,28 @@ ErrorCode FileReader::try_get_pos (size_t& pos) { ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { size_t num_bytes_read; - num_bytes_read = ::read(m_fd, m_read_buffer, cReaderBufferSize); - if (num_bytes_read < num_bytes_to_read) { - reached_eof = true; - } - if (num_bytes_read == -1) { - return ErrorCode_errno; + if (false == m_checkpoint_enabled) { + num_bytes_read = ::read(m_fd, m_read_buffer, cReaderBufferSize); + if (num_bytes_read < num_bytes_to_read) { + reached_eof = true; + } + if (num_bytes_read == -1) { + return ErrorCode_errno; + } + reset_buffer(m_read_buffer, num_bytes_read); + } else { + // increase buffer size + m_read_buffer = (int8_t*)realloc(m_read_buffer, m_size + cReaderBufferSize); + m_buffer = m_read_buffer; + num_bytes_read = ::read(m_fd, m_read_buffer + m_size, cReaderBufferSize); + m_size += cReaderBufferSize; + if 
(num_bytes_read < num_bytes_to_read) { + reached_eof = true; + } + if (num_bytes_read == -1) { + return ErrorCode_errno; + } } - reset_buffer(m_read_buffer, num_bytes_read); - return ErrorCode_Success; } @@ -51,14 +64,6 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num if (nullptr == buf) { return ErrorCode_BadParam; } - // Question: should we delay the fseek? - if (started_reading == false) { - started_reading = true; - auto offset = lseek(m_fd, m_file_pos, SEEK_SET); - if (offset == -1) { - return ErrorCode_errno; - } - } num_bytes_read = 0; size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; @@ -118,45 +123,66 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { if (m_fd == -1) { return ErrorCode_NotInit; } - //TODO: do we need to detect out of range seek? -// struct stat st; -// fstat(m_fd, &st); -// off_t size = st.st_size; -// if (pos >= size) { -// return ErrorCode_EndOfFile; -// } + // early return path + if (pos == m_file_pos) { + return ErrorCode_Success; + } // if we are at A, readed something, seek to B which is on another buffer place. 
// and seek back to A, how will this be handled if (pos > m_file_pos) { auto front_seek_amount = pos - m_file_pos; if (front_seek_amount > m_size - m_cursor_pos) { - // if the seek-to pos is out of buffer - printf("Seek front on %d\n", m_fd); - m_size = 0; - m_cursor_pos = 0; - m_file_pos = pos; + if (m_checkpoint_enabled == false) { + // This should only be needed by GLT as CLP decompress the entire region + // into the passthrough buffer + SPDLOG_ERROR("haven't thought about this yet"); + throw; + m_file_pos = pos; + // we will require to load buffer later + reset_buffer(nullptr, 0); + } else { + // Get the file size + struct stat fileInfo; + fstat(m_fd, &fileInfo); + off_t file_size = fileInfo.st_size; + if (pos > file_size) { + SPDLOG_ERROR("not expecting to seek pass the Entire file"); + throw; + } + + size_t data_read_remaining = front_seek_amount; + // we want to load all file contents between into the buffer + while (true) { + // note, although we refill the buffer, we didn't adjust the + // cur_pos; + refill_reader_buffer(cReaderBufferSize); + if (data_read_remaining < cReaderBufferSize) { + m_file_pos = pos; + m_cursor_pos += front_seek_amount; + // then we are done. 
+ break; + } + data_read_remaining -= cReaderBufferSize; + } + } } else { - // otherwise, we can simply - printf("simple seek front on %d\n", m_fd); + // otherwise, we can simply seek in the same buffer; m_cursor_pos += front_seek_amount; m_file_pos = pos; } } else { - printf("Seek back on %d\n", m_fd); - // the maximum value we can seek back is m_buffer_pos; - if(started_reading == false) { - m_file_pos = pos; - } else { - auto seek_back_amount = m_file_pos - pos; - if (seek_back_amount > m_cursor_pos) { - SPDLOG_ERROR("Can't back trace anymore"); - throw; - } else { - m_cursor_pos = m_cursor_pos - seek_back_amount; - m_file_pos = pos; - } + if (false == m_checkpoint_enabled) { + SPDLOG_ERROR("Seek back not allowed when checkpoint is not enabled"); + return ErrorCode_Failure; } + if (pos < m_checkpointed_pos) { + SPDLOG_ERROR("Seek back before the checkpoint is not supported"); + return ErrorCode_Failure; + } + m_file_pos = pos; + BufferReader::revert_pos(); + m_cursor_pos += pos - m_checkpointed_pos; } return ErrorCode_Success; } @@ -176,14 +202,14 @@ ErrorCode FileReader::try_open (const string& path) { m_path = path; m_file_pos = 0; reached_eof = false; - started_reading = false; // Buffer specific things reset_buffer(nullptr, 0); return ErrorCode_Success; } -ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, string& str) { +ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, + bool append, string& str) { assert(-1 != m_fd); if (false == append) { @@ -231,16 +257,66 @@ void FileReader::open (const string& path) { } } +void FileReader::revert_pos() { + if (false == m_checkpoint_enabled) { + SPDLOG_ERROR("Checkpoint is not enabled"); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + m_file_pos = m_checkpointed_pos; + // this should have revert the pos to the original buffer pos + BufferReader::revert_pos(); +} + +void FileReader::mark_pos() { + if (true == 
m_checkpoint_enabled) { + SPDLOG_ERROR("I haven't carefully think about whether we should allow this or not"); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + m_checkpointed_pos = m_file_pos; + m_checkpoint_enabled = true; + BufferReader::mark_pos(); +} + +// let's assume the checkpoint can only be reset if we are already reading +// recent data +void FileReader::reset_checkpoint () { + + // this basically means we don't allow to reset yet + // because currently we are still reading from buffered data + if (m_size - m_cursor_pos > cReaderBufferSize) { + SPDLOG_ERROR("Not ready for reset checkpoint"); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + if(m_size > cReaderBufferSize) { + int8_t* new_buffer = (int8_t*)malloc(sizeof(int8_t) * cReaderBufferSize); + // copy the last "page" of data over. + size_t copy_pos = m_size - cReaderBufferSize; + memcpy(new_buffer, m_buffer + copy_pos, cReaderBufferSize); + free(m_read_buffer); + m_read_buffer = new_buffer; + + // here, we don't need to touch m_cursor_pos yet; + m_buffer = new_buffer; + } + m_checkpoint_enabled = false; + BufferReader::reset_checkpoint(); +} + void FileReader::close () { if (-1 != m_fd) { - // NOTE: We don't check errors for fclose since it seems the only reason it could fail is if it was interrupted - // by a signal + // NOTE: We don't check errors for fclose since it seems + // the only reason it could fail is if it was interrupted by a signal ::close(m_fd); m_fd = -1; + + if (m_checkpoint_enabled) { + m_read_buffer = (int8_t*)realloc(m_read_buffer, cReaderBufferSize); + m_checkpoint_enabled = false; + } } } -ErrorCode FileReader::try_fstat (struct stat& stat_buffer) { +ErrorCode FileReader::try_fstat (struct stat& stat_buffer) const { if (-1 == m_fd) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 5407be453..2d699f448 100644 --- 
a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -21,16 +21,18 @@ class FileReader : public BufferReader { class OperationFailed : public TraceableException { public: // Constructors - OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : TraceableException (error_code, filename, line_number) {} + OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : + TraceableException (error_code, filename, line_number) {} // Methods - const char* what () const noexcept override { + [[nodiscard]] const char* what () const noexcept override { return "FileReader operation failed"; } }; // Constructors - FileReader() : m_file_pos(0), m_fd(-1) { + FileReader() : m_file_pos(0), m_fd(-1), reached_eof(false), m_checkpoint_enabled(false) + { m_read_buffer = reinterpret_cast(malloc(sizeof(int8_t) * cReaderBufferSize)); } ~FileReader(); @@ -75,10 +77,11 @@ class FileReader : public BufferReader { * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_errno otherwise */ - ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; + ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, + bool append, std::string& str) override; // Methods - bool is_open () const { return -1 != m_fd; } + [[nodiscard]] bool is_open () const { return -1 != m_fd; } /** * Tries to open a file @@ -107,7 +110,11 @@ class FileReader : public BufferReader { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer); + [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer) const; + + void mark_pos(); + void revert_pos(); + void reset_checkpoint (); private: @@ -118,11 +125,14 @@ class FileReader : public BufferReader { int m_fd; std::string m_path; bool reached_eof; - bool started_reading; // Buffer specific data int8_t* m_read_buffer; static constexpr size_t cReaderBufferSize 
= 65536; + + // checkpoint specific data + bool m_checkpoint_enabled; + size_t m_checkpointed_pos; }; From d2fc530eadc13a527b196763dfeca13482eb7844 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 9 May 2023 23:57:42 -0400 Subject: [PATCH 014/121] Make existing clp code use checkpoint feature --- components/core/src/dictionary_utils.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/components/core/src/dictionary_utils.cpp b/components/core/src/dictionary_utils.cpp index 7fd57df90..10a8a3542 100644 --- a/components/core/src/dictionary_utils.cpp +++ b/components/core/src/dictionary_utils.cpp @@ -5,12 +5,14 @@ void open_dictionary_for_reading (const std::string& dictionary_path, const std: FileReader& segment_index_file_reader, streaming_compression::Decompressor& segment_index_decompressor) { dictionary_file_reader.open(dictionary_path); + dictionary_file_reader.mark_pos(); // Skip header dictionary_file_reader.seek_from_begin(sizeof(uint64_t)); // Open decompressor dictionary_decompressor.open(dictionary_file_reader, decompressor_file_read_buffer_capacity); segment_index_file_reader.open(segment_index_path); + segment_index_file_reader.mark_pos(); // Skip header segment_index_file_reader.seek_from_begin(sizeof(uint64_t)); // Open decompressor @@ -23,6 +25,7 @@ uint64_t read_dictionary_header (FileReader& file_reader) { uint64_t num_dictionary_entries; file_reader.read_numeric_value(num_dictionary_entries, false); file_reader.seek_from_begin(dictionary_file_reader_pos); + file_reader.reset_checkpoint(); return num_dictionary_entries; } @@ -33,5 +36,6 @@ uint64_t read_segment_index_header (FileReader& file_reader) { uint64_t num_segments; file_reader.read_numeric_value(num_segments, false); file_reader.seek_from_begin(segment_index_file_reader_pos); + file_reader.reset_checkpoint(); return num_segments; } From 921517a79ebbd26abf02df2b26c0697357960940 Mon Sep 17 00:00:00 2001 From: haiqi96 
<14502009+haiqi96@users.noreply.github.com> Date: Wed, 10 May 2023 23:37:58 -0400 Subject: [PATCH 015/121] updated some comments on potential improvements and add a temporary unit test to help refactoring --- components/core/src/FileReader.cpp | 50 +++-- components/core/src/FileReader.hpp | 4 +- components/core/tests/test-FileReader.cpp | 211 ++++++++++++++++++++++ 3 files changed, 249 insertions(+), 16 deletions(-) create mode 100644 components/core/tests/test-FileReader.cpp diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index b2a30f6bf..4060f1648 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -30,10 +30,13 @@ ErrorCode FileReader::try_get_pos (size_t& pos) { return ErrorCode_Success; } +//TODO: return number of bytes I readed to avoid eof check ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { size_t num_bytes_read; if (false == m_checkpoint_enabled) { num_bytes_read = ::read(m_fd, m_read_buffer, cReaderBufferSize); + + // todo: keep reading until you see 0 if (num_bytes_read < num_bytes_to_read) { reached_eof = true; } @@ -75,8 +78,11 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num auto error_code = BufferReader::try_read(buf + num_bytes_read, num_bytes_to_read_from_buffer, num_bytes_read_from_buffer); + // what if we make the error to be eof if (ErrorCode_NotInit == error_code) { // else, we refill the buffer + // TODO: we can do something special about the curpos + // if we know only two paths will lead to this place. error_code = refill_reader_buffer(cReaderBufferSize); if (ErrorCode_Success != error_code) { return error_code; @@ -128,19 +134,24 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } - // if we are at A, readed something, seek to B which is on another buffer place. 
- // and seek back to A, how will this be handled if (pos > m_file_pos) { auto front_seek_amount = pos - m_file_pos; if (front_seek_amount > m_size - m_cursor_pos) { if (m_checkpoint_enabled == false) { - // This should only be needed by GLT as CLP decompress the entire region - // into the passthrough buffer - SPDLOG_ERROR("haven't thought about this yet"); - throw; + // let's assume we want the read to be always page aligned + auto buffer_aligned_pos = pos & cBufferAlignedMask; + auto offset = lseek(m_fd, buffer_aligned_pos, SEEK_SET); + if (offset == -1) { + return ErrorCode_errno; + } + // now the issue is that: if we want to delay buffer loading + // how can we propogate the right m_cursor_pos + if (auto error_code = refill_reader_buffer(cReaderBufferSize); + ErrorCode_Success != error_code) { + return error_code; + } m_file_pos = pos; - // we will require to load buffer later - reset_buffer(nullptr, 0); + m_cursor_pos = pos - buffer_aligned_pos; } else { // Get the file size struct stat fileInfo; @@ -153,10 +164,16 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { size_t data_read_remaining = front_seek_amount; // we want to load all file contents between into the buffer + // TODO: what if without reading the files, user seek to a further place + // hence the buffer is nullptr pointing. and then user set the checkpoint, + // and then seek again? in this case, it should still work? 
because the refill + // buffer will handle it interanlly while (true) { - // note, although we refill the buffer, we didn't adjust the - // cur_pos; - refill_reader_buffer(cReaderBufferSize); + // keep refilling the buffer + if (auto error_code = refill_reader_buffer(cReaderBufferSize); + ErrorCode_Success != error_code) { + return error_code; + } if (data_read_remaining < cReaderBufferSize) { m_file_pos = pos; m_cursor_pos += front_seek_amount; @@ -281,22 +298,26 @@ void FileReader::mark_pos() { // recent data void FileReader::reset_checkpoint () { - // this basically means we don't allow to reset yet + // we don't allow to reset yet // because currently we are still reading from buffered data + // and we are not at the last "page" of data if (m_size - m_cursor_pos > cReaderBufferSize) { SPDLOG_ERROR("Not ready for reset checkpoint"); + // or we can only remove out of dated buffer, and reclain memory when + // we read the next page throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } if(m_size > cReaderBufferSize) { - int8_t* new_buffer = (int8_t*)malloc(sizeof(int8_t) * cReaderBufferSize); + auto new_buffer = (int8_t*)malloc(sizeof(int8_t) * cReaderBufferSize); // copy the last "page" of data over. 
size_t copy_pos = m_size - cReaderBufferSize; memcpy(new_buffer, m_buffer + copy_pos, cReaderBufferSize); free(m_read_buffer); m_read_buffer = new_buffer; - // here, we don't need to touch m_cursor_pos yet; + m_size = cReaderBufferSize; m_buffer = new_buffer; + m_cursor_pos -= copy_pos; } m_checkpoint_enabled = false; BufferReader::reset_checkpoint(); @@ -310,6 +331,7 @@ void FileReader::close () { m_fd = -1; if (m_checkpoint_enabled) { + // ADD a debug log message m_read_buffer = (int8_t*)realloc(m_read_buffer, cReaderBufferSize); m_checkpoint_enabled = false; } diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 2d699f448..fe73ad777 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -128,8 +128,8 @@ class FileReader : public BufferReader { // Buffer specific data int8_t* m_read_buffer; - static constexpr size_t cReaderBufferSize = 65536; - + static constexpr size_t cReaderBufferSize = 1 << 16; + static constexpr size_t cBufferAlignedMask = ~((1 << 16) - 1); // checkpoint specific data bool m_checkpoint_enabled; size_t m_checkpointed_pos; diff --git a/components/core/tests/test-FileReader.cpp b/components/core/tests/test-FileReader.cpp new file mode 100644 index 000000000..a93635e38 --- /dev/null +++ b/components/core/tests/test-FileReader.cpp @@ -0,0 +1,211 @@ +// C libraries +#include + +// Boost libraries +#include + +// Catch2 +#include "../submodules/Catch2/single_include/catch2/catch.hpp" + +// Project headers +#include "../src/FileWriter.hpp" +#include "../src/FileReader.hpp" +#include "../src/Utils.hpp" + +TEST_CASE("Test reading data", "[FileReader]") { + ErrorCode error_code; + + // Initialize data for testing + size_t test_data_size = 4L * 1024 * 1024 + 1; // 4MB + 1 + char* test_data = new char[test_data_size]; + char* read_buffer = new char[test_data_size]; + for (size_t i = 0; i < test_data_size; ++i) { + test_data[i] = (char)('a' + (i % 26)); + } + + std::string 
test_file_path {"FileReader.test"}; + // write to test file + FileWriter file_writer; + file_writer.open(test_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); + file_writer.write(test_data, test_data_size); + file_writer.close(); + + SECTION("General read testing") { + FileReader file_reader; + file_reader.open(test_file_path); + size_t num_bytes_read {0}; + size_t buffer_offset {0}; + + // first, read a small chunk of data; + size_t read_size1 {1023}; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer + buffer_offset, read_size1, + num_bytes_read)); + REQUIRE(read_size1 == num_bytes_read); + buffer_offset += num_bytes_read; + + // second, read a large chunk of data, so + // fileReader will refill the internal buffer + size_t read_size2 {65538}; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer + buffer_offset, read_size2, + num_bytes_read)); + REQUIRE(read_size2 == num_bytes_read); + buffer_offset += num_bytes_read; + + // third, read remaining data + size_t read_size3 = test_data_size - read_size2 - read_size1; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer + buffer_offset, read_size3, + num_bytes_read)); + REQUIRE(read_size3 == num_bytes_read); + buffer_offset += num_bytes_read; + + REQUIRE(0 == memcmp(read_buffer, test_data, test_data_size)); + + // lastly, make sure the buffer is drained out + size_t read_size4 = 1; + REQUIRE(ErrorCode_EndOfFile == file_reader.try_read(read_buffer + buffer_offset, + read_size4, num_bytes_read)); + file_reader.close(); + } + + SECTION("Simple Seek without checkpoint") { + FileReader file_reader; + file_reader.open(test_file_path); + + // seek to some random position and do a read + size_t seek_pos {245}; + size_t num_bytes_to_read = 65540; + size_t num_byte_read; + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + REQUIRE(num_bytes_to_read == 
num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); + + // seek front to random position and do a read + seek_pos = 345212; + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + REQUIRE(num_bytes_to_read == num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); + + // the seek should fail on a backward seek when checkpoint is not enabled + seek_pos -= 1; + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + } + + SECTION("Simple seek with checkpoint") { + FileReader file_reader; + file_reader.open(test_file_path); + + // first, read some data to proceed the file_pos + size_t num_bytes_to_read = 65540; + size_t num_byte_read; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + REQUIRE(file_reader.get_pos() == num_bytes_to_read); + + // set a checkpoint + size_t checkpoint_pos = file_reader.get_pos(); + file_reader.mark_pos(); + + // keep reading some data + num_bytes_to_read = 345212; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + size_t latest_file_pos = checkpoint_pos + num_bytes_to_read; + REQUIRE(file_reader.get_pos() == checkpoint_pos + num_bytes_to_read); + + // now seek back to some where between + size_t seek_pos = file_reader.get_pos() / 2; + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + REQUIRE(num_bytes_to_read == num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); + // get the latest file_pos + latest_file_pos = std::max(latest_file_pos, file_reader.get_pos()); + + + // now try to seek back to an unacceptable place + REQUIRE(ErrorCode_Failure == 
file_reader.try_seek_from_begin(checkpoint_pos-1)); + // try reset, which should fail now. + // REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + + // now go back to latest data + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(latest_file_pos)); + // reset, and then seek back should fail + file_reader.reset_checkpoint(); + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + + // make sure data read after checkpoint-reset still matches + num_bytes_to_read = 4096; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + + REQUIRE(num_bytes_to_read == num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, num_bytes_to_read)); + + // Make sure now we can't reset back to checkpoint + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + } + + SECTION("Simple seek with delayed read") { + FileReader file_reader; + file_reader.open(test_file_path); + + // first, read seek to some random file_pos + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(45313)); + + // set a checkpoint + size_t checkpoint_pos = file_reader.get_pos(); + file_reader.mark_pos(); + + // keep reading some data + size_t num_bytes_to_read; + size_t num_byte_read; + + num_bytes_to_read = 345212; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + size_t latest_file_pos = checkpoint_pos + num_bytes_to_read; + REQUIRE(file_reader.get_pos() == checkpoint_pos + num_bytes_to_read); + + // now seek back to some where between + size_t seek_pos = file_reader.get_pos() / 2; + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + REQUIRE(num_bytes_to_read == num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); + // get the latest 
file_pos + latest_file_pos = std::max(latest_file_pos, file_reader.get_pos()); + + + // now try to seek back to an unacceptable place + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(checkpoint_pos-1)); + // try reset, which should fail now. + // REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + + // now go back to latest data + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(latest_file_pos)); + // reset, and then seek back should fail + file_reader.reset_checkpoint(); + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + + // make sure data read after checkpoint-reset still matches + num_bytes_to_read = 4096; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + + REQUIRE(num_bytes_to_read == num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, num_bytes_to_read)); + + // Make sure now we can't reset back to checkpoint + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + } + + SECTION("Reset seek with corner cases") { + + } +} \ No newline at end of file From 38b0da7cf38f065d9375ba03267dfff41716c751 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 11 May 2023 10:27:11 -0400 Subject: [PATCH 016/121] remove eof flag --- components/core/src/BufferReader.cpp | 4 +++ components/core/src/FileReader.cpp | 48 ++++++++++++++-------------- components/core/src/FileReader.hpp | 3 +- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 70bf4a76b..20c8452dd 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -29,6 +29,10 @@ using std::string_view; } ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + // this is not defined by specifications, + // but we need this strong behavior 
for the upper class + num_bytes_read = 0; + if (nullptr == m_buffer) { return ErrorCode_NotInit; } diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 4060f1648..93ef5d692 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -34,11 +34,10 @@ ErrorCode FileReader::try_get_pos (size_t& pos) { ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { size_t num_bytes_read; if (false == m_checkpoint_enabled) { - num_bytes_read = ::read(m_fd, m_read_buffer, cReaderBufferSize); - // todo: keep reading until you see 0 - if (num_bytes_read < num_bytes_to_read) { - reached_eof = true; + num_bytes_read = ::read(m_fd, m_read_buffer, num_bytes_to_read); + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; } if (num_bytes_read == -1) { return ErrorCode_errno; @@ -46,16 +45,16 @@ ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { reset_buffer(m_read_buffer, num_bytes_read); } else { // increase buffer size - m_read_buffer = (int8_t*)realloc(m_read_buffer, m_size + cReaderBufferSize); + m_read_buffer = (int8_t*)realloc(m_read_buffer, m_size + num_bytes_to_read); m_buffer = m_read_buffer; - num_bytes_read = ::read(m_fd, m_read_buffer + m_size, cReaderBufferSize); - m_size += cReaderBufferSize; - if (num_bytes_read < num_bytes_to_read) { - reached_eof = true; + num_bytes_read = ::read(m_fd, m_read_buffer + m_size, num_bytes_to_read); + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; } if (num_bytes_read == -1) { return ErrorCode_errno; } + m_size += num_bytes_read; } return ErrorCode_Success; } @@ -70,8 +69,7 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num num_bytes_read = 0; size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; - size_t num_bytes_read_from_buffer; - + size_t num_bytes_read_from_buffer {0}; // keep reading bool finish_reading = false; while (false == finish_reading) { @@ -96,24 +94,30 @@ ErrorCode 
FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num } else { // else, we refill the buffer error_code = refill_reader_buffer(cReaderBufferSize); - if (ErrorCode_Success != error_code) { + // TODO: here is refill_reader_buffer returns eof, we can't simply + // return eof, because we might have already readed some data + if (ErrorCode_EndOfFile == error_code) { + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } else { + finish_reading = true; + } + } + else if (ErrorCode_Success != error_code) { return error_code; } } } else if (ErrorCode_EndOfFile == error_code) { - // if we encounter the end of file, means the buffer - // happens to be drained out and we didn't read any data from it - // in this case, exit if reached_eof, or simply refill the buffer. - if (reached_eof) { + // else, we refill the buffer + error_code = refill_reader_buffer(cReaderBufferSize); + if (ErrorCode_EndOfFile == error_code) { if (num_bytes_read == 0) { return ErrorCode_EndOfFile; } else { - return ErrorCode_Success; + finish_reading = true; } } - // else, we refill the buffer - error_code = refill_reader_buffer(cReaderBufferSize); - if (ErrorCode_Success != error_code) { + else if (ErrorCode_Success != error_code) { return error_code; } } else { @@ -218,7 +222,6 @@ ErrorCode FileReader::try_open (const string& path) { } m_path = path; m_file_pos = 0; - reached_eof = false; // Buffer specific things reset_buffer(nullptr, 0); @@ -251,9 +254,6 @@ ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, m_file_pos += cursor - m_cursor_pos; m_cursor_pos = cursor; if (false == found_delim) { - if (reached_eof) { - return ErrorCode_EndOfFile; - } if (auto error_code = refill_reader_buffer(cReaderBufferSize); ErrorCode_Success != error_code) { return error_code; diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index fe73ad777..596d3d215 100644 --- a/components/core/src/FileReader.hpp +++ 
b/components/core/src/FileReader.hpp @@ -31,7 +31,7 @@ class FileReader : public BufferReader { }; // Constructors - FileReader() : m_file_pos(0), m_fd(-1), reached_eof(false), m_checkpoint_enabled(false) + FileReader() : m_file_pos(0), m_fd(-1), m_checkpoint_enabled(false) { m_read_buffer = reinterpret_cast(malloc(sizeof(int8_t) * cReaderBufferSize)); } @@ -124,7 +124,6 @@ class FileReader : public BufferReader { size_t m_file_pos; int m_fd; std::string m_path; - bool reached_eof; // Buffer specific data int8_t* m_read_buffer; From e663312db7faf787eb9d141a7cdbd2edc503fdb8 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Fri, 12 May 2023 16:15:51 -0400 Subject: [PATCH 017/121] remove checkpoint related functions from readerbuffer --- components/core/src/BufferReader.cpp | 23 ++--------------- components/core/src/BufferReader.hpp | 19 +++----------- components/core/src/FileReader.cpp | 8 +++--- components/core/src/FileReader.hpp | 1 + .../src/ffi/ir_stream/decoding_methods.cpp | 25 +++++++++++-------- 5 files changed, 24 insertions(+), 52 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 20c8452dd..55d8a6a0f 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -9,7 +9,7 @@ using std::string_view; [[nodiscard]] ErrorCode BufferReader::try_get_pos (size_t& pos) { - if (nullptr == m_buffer) { + if (nullptr == m_buffer || 0 == m_size) { return ErrorCode_NotInit; } pos = m_cursor_pos; @@ -17,10 +17,9 @@ using std::string_view; } [[nodiscard]] ErrorCode BufferReader::try_seek_from_begin (size_t pos) { - if (nullptr == m_buffer) { + if (nullptr == m_buffer || 0 == m_size) { return ErrorCode_NotInit; } - // TODO: should we throw the error? 
if (pos > m_size) { return ErrorCode_OutOfBounds; } @@ -50,24 +49,6 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n return ErrorCode_Success; } -void BufferReader::mark_pos () { - checkpoint_enable = true; - m_checkpoint_pos = m_cursor_pos; -} - -void BufferReader::revert_pos () { - if (false == checkpoint_enable) { - SPDLOG_ERROR("DictionaryWriter ran out of IDs."); - throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); - } - m_cursor_pos = m_checkpoint_pos; -} - -void BufferReader::reset_checkpoint () { - m_checkpoint_pos = 0; - checkpoint_enable = false; -} - bool BufferReader::try_read_string_view (string_view& str_view, size_t read_size) { if ((m_cursor_pos + read_size) > m_size) { return false; diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 17a1b1e74..235467d49 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -25,32 +25,23 @@ class BufferReader : public ReaderInterface { }; BufferReader () : m_buffer(nullptr), m_size(0), - m_cursor_pos(0), - m_checkpoint_pos(0), - checkpoint_enable(false) {} + m_cursor_pos(0) {} BufferReader (const int8_t* data, size_t size) : m_buffer(data), m_size(size), - m_cursor_pos(0), - m_checkpoint_pos(0), - checkpoint_enable(false) {} + m_cursor_pos(0) {} [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; - + [[nodiscard]] size_t get_buffer_length() { return m_size; } void reset_buffer (const int8_t* data, size_t size) { m_buffer = data; m_size = size; m_cursor_pos = 0; } - // The following methods should only be used by the decoder - virtual void mark_pos (); - virtual void revert_pos (); - virtual void reset_checkpoint (); - /** * Tries reading a string view of size = read_size from the ir_buf. 
* @param str_view Returns the string view @@ -65,10 +56,6 @@ class BufferReader : public ReaderInterface { const int8_t* m_buffer; size_t m_size; size_t m_cursor_pos; - -private: - bool checkpoint_enable; - size_t m_checkpoint_pos; }; diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 93ef5d692..3adfde4d1 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -202,8 +202,7 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Failure; } m_file_pos = pos; - BufferReader::revert_pos(); - m_cursor_pos += pos - m_checkpointed_pos; + m_cursor_pos = m_checkpointed_buffer_pos + (pos - m_checkpointed_pos); } return ErrorCode_Success; } @@ -281,7 +280,7 @@ void FileReader::revert_pos() { } m_file_pos = m_checkpointed_pos; // this should have revert the pos to the original buffer pos - BufferReader::revert_pos(); + m_cursor_pos = m_checkpointed_buffer_pos; } void FileReader::mark_pos() { @@ -291,7 +290,7 @@ void FileReader::mark_pos() { } m_checkpointed_pos = m_file_pos; m_checkpoint_enabled = true; - BufferReader::mark_pos(); + m_checkpointed_buffer_pos = m_cursor_pos; } // let's assume the checkpoint can only be reset if we are already reading @@ -320,7 +319,6 @@ void FileReader::reset_checkpoint () { m_cursor_pos -= copy_pos; } m_checkpoint_enabled = false; - BufferReader::reset_checkpoint(); } void FileReader::close () { diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 596d3d215..72a0cf00a 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -132,6 +132,7 @@ class FileReader : public BufferReader { // checkpoint specific data bool m_checkpoint_enabled; size_t m_checkpointed_pos; + size_t m_checkpointed_buffer_pos; }; diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 5680cb5f8..ccf511618 100644 --- 
a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -463,13 +463,16 @@ namespace ffi::ir_stream { } IRErrorCode get_encoding_type (BufferReader& ir_buf, bool& is_four_bytes_encoding) { - ir_buf.mark_pos(); + if (ir_buf.get_buffer_length() == 0) { + return IRErrorCode_Incomplete_IR; + } + size_t seekback_pos = ir_buf.get_pos(); char buffer[cProtocol::MagicNumberLength]; size_t num_bytes_read; auto error_code = ir_buf.try_read(buffer, cProtocol::MagicNumberLength, num_bytes_read); if (error_code != ErrorCode_Success || num_bytes_read != cProtocol::MagicNumberLength) { - ir_buf.revert_pos(); + ir_buf.seek_from_begin(seekback_pos); return IRErrorCode_Incomplete_IR; } if (0 == memcmp(buffer, cProtocol::FourByteEncodingMagicNumber, @@ -479,26 +482,25 @@ namespace ffi::ir_stream { cProtocol::MagicNumberLength)) { is_four_bytes_encoding = false; } else { - ir_buf.revert_pos(); + ir_buf.seek_from_begin(seekback_pos); return IRErrorCode_Corrupted_IR; } return IRErrorCode_Success; } - + IRErrorCode decode_preamble (BufferReader& ir_buf, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size) { - ir_buf.mark_pos(); - + size_t marked_pos = ir_buf.get_pos(); if (auto error_code = read_metadata_info(ir_buf, metadata_type, metadata_size); error_code != IRErrorCode_Success) { - ir_buf.revert_pos(); + ir_buf.seek_from_begin(marked_pos); return error_code; } metadata_pos = ir_buf.get_pos(); //TODO: this might not be optimal if (ErrorCode_Success != ir_buf.try_seek_from_begin(metadata_pos + metadata_size)) { - ir_buf.revert_pos(); + ir_buf.seek_from_begin(marked_pos); return IRErrorCode_Incomplete_IR; } return IRErrorCode_Success; @@ -518,12 +520,15 @@ namespace ffi::ir_stream { IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, epoch_time_ms_t& timestamp) { - ir_buf.mark_pos(); + if (ir_buf.get_buffer_length() == 0) { + return IRErrorCode_Incomplete_IR; + } + size_t 
marked_pos = ir_buf.get_pos(); auto error_code = generic_decode_next_message( ir_buf, message, timestamp ); if (IRErrorCode_Success != error_code) { - ir_buf.revert_pos(); + ir_buf.seek_from_begin(marked_pos); } return error_code; } From fd02a5f4e6f44f07938f51d3c599bc6fb21c66be Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sat, 13 May 2023 15:34:35 -0400 Subject: [PATCH 018/121] refactor try_read logic --- components/core/src/FileReader.cpp | 89 +++++++++++++++--------------- 1 file changed, 43 insertions(+), 46 deletions(-) diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 3adfde4d1..f42c42952 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -30,29 +30,47 @@ ErrorCode FileReader::try_get_pos (size_t& pos) { return ErrorCode_Success; } -//TODO: return number of bytes I readed to avoid eof check +static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, + size_t& num_bytes_read) { + num_bytes_read = 0; + while (true) { + auto bytes_read = ::read(fd, buffer + num_bytes_read, num_bytes_to_read); + if (bytes_read == -1) { + return ErrorCode_errno; + } + if (bytes_read == 0) { + break; + } + num_bytes_read += bytes_read; + num_bytes_to_read -= bytes_read; + if (num_bytes_to_read == 0) { + return ErrorCode_Success; + } + } + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; +} + + ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { - size_t num_bytes_read; + size_t num_bytes_read {0}; if (false == m_checkpoint_enabled) { - // todo: keep reading until you see 0 - num_bytes_read = ::read(m_fd, m_read_buffer, num_bytes_to_read); - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } - if (num_bytes_read == -1) { - return ErrorCode_errno; + auto error_code = try_read_into_buffer(m_fd, m_read_buffer, + num_bytes_to_read, num_bytes_read); + if (error_code 
!= ErrorCode_Success) { + return error_code; } reset_buffer(m_read_buffer, num_bytes_read); } else { // increase buffer size m_read_buffer = (int8_t*)realloc(m_read_buffer, m_size + num_bytes_to_read); m_buffer = m_read_buffer; - num_bytes_read = ::read(m_fd, m_read_buffer + m_size, num_bytes_to_read); - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } - if (num_bytes_read == -1) { - return ErrorCode_errno; + auto error_code = try_read_into_buffer(m_fd, m_read_buffer + m_size, num_bytes_to_read, + num_bytes_read); + if (error_code != ErrorCode_Success) { + return error_code; } m_size += num_bytes_read; } @@ -71,58 +89,37 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; size_t num_bytes_read_from_buffer {0}; // keep reading - bool finish_reading = false; - while (false == finish_reading) { + while (true) { auto error_code = BufferReader::try_read(buf + num_bytes_read, num_bytes_to_read_from_buffer, num_bytes_read_from_buffer); - // what if we make the error to be eof - if (ErrorCode_NotInit == error_code) { - // else, we refill the buffer - // TODO: we can do something special about the curpos - // if we know only two paths will lead to this place. 
- error_code = refill_reader_buffer(cReaderBufferSize); - if (ErrorCode_Success != error_code) { - return error_code; - } - } else if (ErrorCode_Success == error_code) { + if (ErrorCode_Success == error_code || + ErrorCode_EndOfFile == error_code || + ErrorCode_NotInit == error_code) + { m_file_pos += num_bytes_read_from_buffer; num_bytes_read += num_bytes_read_from_buffer; num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; if (num_bytes_to_read_from_buffer == 0) { - finish_reading = true; + break; } else { // else, we refill the buffer error_code = refill_reader_buffer(cReaderBufferSize); - // TODO: here is refill_reader_buffer returns eof, we can't simply + // TODO: here if refill_reader_buffer returns eof, we can't simply // return eof, because we might have already readed some data if (ErrorCode_EndOfFile == error_code) { if (num_bytes_read == 0) { return ErrorCode_EndOfFile; } else { - finish_reading = true; + break; } } else if (ErrorCode_Success != error_code) { return error_code; } } - } else if (ErrorCode_EndOfFile == error_code) { - // else, we refill the buffer - error_code = refill_reader_buffer(cReaderBufferSize); - if (ErrorCode_EndOfFile == error_code) { - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } else { - finish_reading = true; - } - } - else if (ErrorCode_Success != error_code) { - return error_code; - } } else { - // else some unexpected error code is encountered. 
- throw OperationFailed(error_code, __FILENAME__, __LINE__); + return error_code; } } return ErrorCode_Success; @@ -329,7 +326,7 @@ void FileReader::close () { m_fd = -1; if (m_checkpoint_enabled) { - // ADD a debug log message + // TODO: add a debug log message m_read_buffer = (int8_t*)realloc(m_read_buffer, cReaderBufferSize); m_checkpoint_enabled = false; } From 237bb3f0feb83bdc847479a14d0b24f2d7eb6fbf Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sat, 13 May 2023 17:37:30 -0400 Subject: [PATCH 019/121] clean up --- components/core/src/FileReader.cpp | 114 +++++++++++++---------------- components/core/src/FileReader.hpp | 3 +- 2 files changed, 54 insertions(+), 63 deletions(-) diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index f42c42952..3910f7a81 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -33,6 +33,7 @@ ErrorCode FileReader::try_get_pos (size_t& pos) { static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, size_t& num_bytes_read) { num_bytes_read = 0; + // keep reading from the fd until seeing a 0 while (true) { auto bytes_read = ::read(fd, buffer + num_bytes_read, num_bytes_to_read); if (bytes_read == -1) { @@ -53,9 +54,13 @@ static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_t return ErrorCode_Success; } - ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { - size_t num_bytes_read {0}; + size_t num_bytes_read; + return refill_reader_buffer (num_bytes_to_read, num_bytes_read); +} + +ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read, size_t& num_bytes_read) { + num_bytes_read = 0; if (false == m_checkpoint_enabled) { auto error_code = try_read_into_buffer(m_fd, m_read_buffer, num_bytes_to_read, num_bytes_read); @@ -95,29 +100,25 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num 
num_bytes_read_from_buffer); if (ErrorCode_Success == error_code || ErrorCode_EndOfFile == error_code || - ErrorCode_NotInit == error_code) - { + ErrorCode_NotInit == error_code) { m_file_pos += num_bytes_read_from_buffer; num_bytes_read += num_bytes_read_from_buffer; num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; if (num_bytes_to_read_from_buffer == 0) { break; - } else { - // else, we refill the buffer - error_code = refill_reader_buffer(cReaderBufferSize); - // TODO: here if refill_reader_buffer returns eof, we can't simply - // return eof, because we might have already readed some data - if (ErrorCode_EndOfFile == error_code) { - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } else { - break; - } - } - else if (ErrorCode_Success != error_code) { - return error_code; + } + // refill the buffer if more bytes are to be read + error_code = refill_reader_buffer(cReaderBufferSize); + if (ErrorCode_EndOfFile == error_code) { + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } else { + break; } } + else if (ErrorCode_Success != error_code) { + return error_code; + } } else { return error_code; } @@ -135,11 +136,26 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } - if (pos > m_file_pos) { - auto front_seek_amount = pos - m_file_pos; - if (front_seek_amount > m_size - m_cursor_pos) { - if (m_checkpoint_enabled == false) { - // let's assume we want the read to be always page aligned + if (pos <= m_file_pos) { + if (false == m_checkpoint_enabled) { + SPDLOG_ERROR("Seek back not allowed when checkpoint is not enabled"); + return ErrorCode_Failure; + } + if (pos < m_checkpointed_pos) { + SPDLOG_ERROR("Seek back before the checkpoint is not supported"); + return ErrorCode_Failure; + } + m_cursor_pos -= (m_file_pos - pos); + m_file_pos = pos; + } else { + auto seek_distance = pos - m_file_pos; + if (seek_distance <= m_size - m_cursor_pos) { + // we can simply seek in the same buffer; + m_cursor_pos += 
seek_distance; + m_file_pos = pos; + } else { + if (false == m_checkpoint_enabled) { + // let's assume we want the read to be always page or buffer aligned auto buffer_aligned_pos = pos & cBufferAlignedMask; auto offset = lseek(m_fd, buffer_aligned_pos, SEEK_SET); if (offset == -1) { @@ -154,52 +170,27 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { m_file_pos = pos; m_cursor_pos = pos - buffer_aligned_pos; } else { - // Get the file size - struct stat fileInfo; - fstat(m_fd, &fileInfo); - off_t file_size = fileInfo.st_size; - if (pos > file_size) { - SPDLOG_ERROR("not expecting to seek pass the Entire file"); - throw; - } - - size_t data_read_remaining = front_seek_amount; - // we want to load all file contents between into the buffer - // TODO: what if without reading the files, user seek to a further place - // hence the buffer is nullptr pointing. and then user set the checkpoint, - // and then seek again? in this case, it should still work? because the refill - // buffer will handle it interanlly + size_t data_read_remaining = seek_distance; + size_t num_bytes_refilled; while (true) { // keep refilling the buffer - if (auto error_code = refill_reader_buffer(cReaderBufferSize); - ErrorCode_Success != error_code) { + auto error_code = refill_reader_buffer(cReaderBufferSize, num_bytes_refilled); + if (ErrorCode_EndOfFile == error_code) { + SPDLOG_ERROR("not expecting to seek pass the Entire file"); + throw; + } + else if (ErrorCode_Success != error_code) { return error_code; } - if (data_read_remaining < cReaderBufferSize) { + if (data_read_remaining <= num_bytes_refilled) { m_file_pos = pos; - m_cursor_pos += front_seek_amount; - // then we are done. 
+ m_cursor_pos += seek_distance; break; } data_read_remaining -= cReaderBufferSize; } } - } else { - // otherwise, we can simply seek in the same buffer; - m_cursor_pos += front_seek_amount; - m_file_pos = pos; - } - } else { - if (false == m_checkpoint_enabled) { - SPDLOG_ERROR("Seek back not allowed when checkpoint is not enabled"); - return ErrorCode_Failure; - } - if (pos < m_checkpointed_pos) { - SPDLOG_ERROR("Seek back before the checkpoint is not supported"); - return ErrorCode_Failure; } - m_file_pos = pos; - m_cursor_pos = m_checkpointed_buffer_pos + (pos - m_checkpointed_pos); } return ErrorCode_Success; } @@ -218,12 +209,11 @@ ErrorCode FileReader::try_open (const string& path) { } m_path = path; m_file_pos = 0; - - // Buffer specific things reset_buffer(nullptr, 0); return ErrorCode_Success; } +// TODO: optimize this a bit? ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, string& str) { assert(-1 != m_fd); @@ -275,8 +265,8 @@ void FileReader::revert_pos() { SPDLOG_ERROR("Checkpoint is not enabled"); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - m_file_pos = m_checkpointed_pos; // this should have revert the pos to the original buffer pos + m_file_pos = m_checkpointed_pos; m_cursor_pos = m_checkpointed_buffer_pos; } @@ -286,8 +276,8 @@ void FileReader::mark_pos() { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } m_checkpointed_pos = m_file_pos; - m_checkpoint_enabled = true; m_checkpointed_buffer_pos = m_cursor_pos; + m_checkpoint_enabled = true; } // let's assume the checkpoint can only be reset if we are already reading diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 72a0cf00a..c1ded28f1 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -117,8 +117,8 @@ class FileReader : public BufferReader { void reset_checkpoint (); private: - ErrorCode refill_reader_buffer(size_t 
num_bytes_to_read); + ErrorCode refill_reader_buffer(size_t num_bytes_to_read, size_t& num_bytes_read); // Types size_t m_file_pos; @@ -126,6 +126,7 @@ class FileReader : public BufferReader { std::string m_path; // Buffer specific data + // TODO: either turn this into a unique ptr, or at least use new & delete int8_t* m_read_buffer; static constexpr size_t cReaderBufferSize = 1 << 16; static constexpr size_t cBufferAlignedMask = ~((1 << 16) - 1); From 6cd7516d041b7c96d12c314b67fc334d4ea8442c Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sat, 13 May 2023 21:38:38 -0400 Subject: [PATCH 020/121] Allow more flexible reset and delay buffer loading for seek --- components/core/src/FileReader.cpp | 65 +++++++++++------------ components/core/src/FileReader.hpp | 11 ++-- components/core/tests/test-FileReader.cpp | 13 ++--- 3 files changed, 43 insertions(+), 46 deletions(-) diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 3910f7a81..1a9d6e7aa 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -54,25 +54,30 @@ static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_t return ErrorCode_Success; } -ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read) { +ErrorCode FileReader::refill_reader_buffer () { size_t num_bytes_read; - return refill_reader_buffer (num_bytes_to_read, num_bytes_read); + return refill_reader_buffer (num_bytes_read); } -ErrorCode FileReader::refill_reader_buffer (size_t num_bytes_to_read, size_t& num_bytes_read) { +ErrorCode FileReader::refill_reader_buffer (size_t& num_bytes_read) { num_bytes_read = 0; if (false == m_checkpoint_enabled) { + // recover from a previous reset + if (m_size > cReaderBufferSize) { + m_read_buffer = (int8_t*)realloc(m_read_buffer, cReaderBufferSize); + } auto error_code = try_read_into_buffer(m_fd, m_read_buffer, - num_bytes_to_read, num_bytes_read); + 
cReaderBufferSize, num_bytes_read); if (error_code != ErrorCode_Success) { return error_code; } - reset_buffer(m_read_buffer, num_bytes_read); + m_size = num_bytes_read; + m_cursor_pos = m_file_pos & cCursorMask; } else { // increase buffer size - m_read_buffer = (int8_t*)realloc(m_read_buffer, m_size + num_bytes_to_read); + m_read_buffer = (int8_t*)realloc(m_read_buffer, m_size + cReaderBufferSize); m_buffer = m_read_buffer; - auto error_code = try_read_into_buffer(m_fd, m_read_buffer + m_size, num_bytes_to_read, + auto error_code = try_read_into_buffer(m_fd, m_read_buffer + m_size, cReaderBufferSize, num_bytes_read); if (error_code != ErrorCode_Success) { return error_code; @@ -108,7 +113,7 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num break; } // refill the buffer if more bytes are to be read - error_code = refill_reader_buffer(cReaderBufferSize); + error_code = refill_reader_buffer(); if (ErrorCode_EndOfFile == error_code) { if (num_bytes_read == 0) { return ErrorCode_EndOfFile; @@ -161,20 +166,16 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { if (offset == -1) { return ErrorCode_errno; } - // now the issue is that: if we want to delay buffer loading - // how can we propogate the right m_cursor_pos - if (auto error_code = refill_reader_buffer(cReaderBufferSize); - ErrorCode_Success != error_code) { - return error_code; - } + m_size = 0; m_file_pos = pos; - m_cursor_pos = pos - buffer_aligned_pos; + // TODO: This line is needed in case + m_cursor_pos = m_file_pos & cCursorMask; } else { size_t data_read_remaining = seek_distance; size_t num_bytes_refilled; while (true) { // keep refilling the buffer - auto error_code = refill_reader_buffer(cReaderBufferSize, num_bytes_refilled); + auto error_code = refill_reader_buffer(num_bytes_refilled); if (ErrorCode_EndOfFile == error_code) { SPDLOG_ERROR("not expecting to seek pass the Entire file"); throw; @@ -209,7 +210,7 @@ ErrorCode FileReader::try_open (const string& 
path) { } m_path = path; m_file_pos = 0; - reset_buffer(nullptr, 0); + reset_buffer(m_read_buffer, 0); return ErrorCode_Success; } @@ -240,7 +241,7 @@ ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, m_file_pos += cursor - m_cursor_pos; m_cursor_pos = cursor; if (false == found_delim) { - if (auto error_code = refill_reader_buffer(cReaderBufferSize); + if (auto error_code = refill_reader_buffer(); ErrorCode_Success != error_code) { return error_code; } @@ -283,27 +284,23 @@ void FileReader::mark_pos() { // let's assume the checkpoint can only be reset if we are already reading // recent data void FileReader::reset_checkpoint () { - - // we don't allow to reset yet - // because currently we are still reading from buffered data - // and we are not at the last "page" of data - if (m_size - m_cursor_pos > cReaderBufferSize) { - SPDLOG_ERROR("Not ready for reset checkpoint"); - // or we can only remove out of dated buffer, and reclain memory when - // we read the next page - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + // alternatively, we can keep claiming back the memory + if (false == m_checkpoint_enabled) { + return; } - if(m_size > cReaderBufferSize) { - auto new_buffer = (int8_t*)malloc(sizeof(int8_t) * cReaderBufferSize); - // copy the last "page" of data over. 
- size_t copy_pos = m_size - cReaderBufferSize; - memcpy(new_buffer, m_buffer + copy_pos, cReaderBufferSize); + if (m_size != cReaderBufferSize) { + auto buffer_aligned_copy_pos = m_cursor_pos & cBufferAlignedMask; + auto remaining_data_size = m_size - buffer_aligned_copy_pos; + auto buffer_quantized_size = (1 + ((remaining_data_size - 1) >> cBufferExp)) << cBufferExp; + auto new_buffer = (int8_t*)malloc(sizeof(int8_t) * buffer_quantized_size); + + memcpy(new_buffer, m_buffer + buffer_aligned_copy_pos, remaining_data_size); free(m_read_buffer); m_read_buffer = new_buffer; - m_size = cReaderBufferSize; + m_size = remaining_data_size; m_buffer = new_buffer; - m_cursor_pos -= copy_pos; + m_cursor_pos -= buffer_aligned_copy_pos; } m_checkpoint_enabled = false; } diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index c1ded28f1..01825cc7a 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -117,8 +117,8 @@ class FileReader : public BufferReader { void reset_checkpoint (); private: - ErrorCode refill_reader_buffer(size_t num_bytes_to_read); - ErrorCode refill_reader_buffer(size_t num_bytes_to_read, size_t& num_bytes_read); + ErrorCode refill_reader_buffer(); + ErrorCode refill_reader_buffer(size_t& num_bytes_read); // Types size_t m_file_pos; @@ -128,8 +128,11 @@ class FileReader : public BufferReader { // Buffer specific data // TODO: either turn this into a unique ptr, or at least use new & delete int8_t* m_read_buffer; - static constexpr size_t cReaderBufferSize = 1 << 16; - static constexpr size_t cBufferAlignedMask = ~((1 << 16) - 1); + int8_t* m_buffer_begin_pos; + static constexpr size_t cBufferExp = 16; + static constexpr size_t cReaderBufferSize = 1 << cBufferExp; + static constexpr size_t cBufferAlignedMask = ~(cReaderBufferSize - 1); + static constexpr size_t cCursorMask = cReaderBufferSize - 1; // checkpoint specific data bool m_checkpoint_enabled; size_t m_checkpointed_pos; diff --git 
a/components/core/tests/test-FileReader.cpp b/components/core/tests/test-FileReader.cpp index a93635e38..4a473f5bd 100644 --- a/components/core/tests/test-FileReader.cpp +++ b/components/core/tests/test-FileReader.cpp @@ -184,25 +184,22 @@ TEST_CASE("Test reading data", "[FileReader]") { // now try to seek back to an unacceptable place REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(checkpoint_pos-1)); - // try reset, which should fail now. - // REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + // try reset, which should success now. + file_reader.reset_checkpoint(); + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); // now go back to latest data REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(latest_file_pos)); - // reset, and then seek back should fail - file_reader.reset_checkpoint(); - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); // make sure data read after checkpoint-reset still matches - num_bytes_to_read = 4096; + num_bytes_to_read = 65536; REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); - REQUIRE(num_bytes_to_read == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, num_bytes_to_read)); // Make sure now we can't reset back to checkpoint - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(latest_file_pos)); } SECTION("Reset seek with corner cases") { From eacad4a53ab23150b6c07c928d77b2af1501cb17 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 14 May 2023 16:03:15 -0400 Subject: [PATCH 021/121] refactor --- components/core/src/FileReader.cpp | 219 ++++++++++++++++++----------- components/core/src/FileReader.hpp | 35 +++-- 2 files changed, 149 insertions(+), 105 deletions(-) diff --git a/components/core/src/FileReader.cpp 
b/components/core/src/FileReader.cpp index 1a9d6e7aa..9ee9284a3 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -14,18 +14,29 @@ // Project headers #include +using std::make_unique; +using std::move; using std::string; +FileReader::FileReader () { + m_file_pos = 0; + m_fd = -1; + m_checkpoint_enabled = 0; + constexpr size_t DefaultBufferSize = 65536; + if (ErrorCode_Success != set_buffer_size(DefaultBufferSize)) { + throw "Failed to init reader buffer size\n"; + } + m_read_buffer = make_unique(m_reader_buffer_size); +} + FileReader::~FileReader () { close(); - free(m_read_buffer); } ErrorCode FileReader::try_get_pos (size_t& pos) { if (-1 == m_fd) { return ErrorCode_NotInit; } - pos = m_file_pos; return ErrorCode_Success; } @@ -33,7 +44,7 @@ ErrorCode FileReader::try_get_pos (size_t& pos) { static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, size_t& num_bytes_read) { num_bytes_read = 0; - // keep reading from the fd until seeing a 0 + // keep reading from the fd until seeing a 0, which means eof while (true) { auto bytes_read = ::read(fd, buffer + num_bytes_read, num_bytes_to_read); if (bytes_read == -1) { @@ -54,35 +65,45 @@ static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_t return ErrorCode_Success; } -ErrorCode FileReader::refill_reader_buffer () { - size_t num_bytes_read; - return refill_reader_buffer (num_bytes_read); +ErrorCode FileReader::refill_reader_buffer (size_t refill_size) { + size_t num_bytes_refilled; + return refill_reader_buffer (refill_size, num_bytes_refilled); } -ErrorCode FileReader::refill_reader_buffer (size_t& num_bytes_read) { - num_bytes_read = 0; +ErrorCode FileReader::refill_reader_buffer (size_t refill_size, size_t& num_bytes_refilled) { + num_bytes_refilled = 0; if (false == m_checkpoint_enabled) { // recover from a previous reset - if (m_size > cReaderBufferSize) { - m_read_buffer = (int8_t*)realloc(m_read_buffer, 
cReaderBufferSize); + if (m_size > refill_size) { + m_read_buffer = make_unique(refill_size); + m_buffer = m_read_buffer.get(); } - auto error_code = try_read_into_buffer(m_fd, m_read_buffer, - cReaderBufferSize, num_bytes_read); + auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get(), + refill_size, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } - m_size = num_bytes_read; - m_cursor_pos = m_file_pos & cCursorMask; + m_size = num_bytes_refilled; + m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; } else { - // increase buffer size - m_read_buffer = (int8_t*)realloc(m_read_buffer, m_size + cReaderBufferSize); - m_buffer = m_read_buffer; - auto error_code = try_read_into_buffer(m_fd, m_read_buffer + m_size, cReaderBufferSize, - num_bytes_read); + if (m_size == 0) { + // if buffer is uninitialized, reset the cursor_pos + // in case it's after a seek + m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; + } + + // Messy way of copying data from old buffer to new buffer + auto new_buffer = make_unique(m_size + refill_size); + memcpy(new_buffer.get(), m_read_buffer.get(), m_size); + m_read_buffer = std::move(new_buffer); + m_buffer = m_read_buffer.get(); + + auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get() + m_size, refill_size, + num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } - m_size += num_bytes_read; + m_size += num_bytes_refilled; } return ErrorCode_Success; } @@ -98,7 +119,7 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num num_bytes_read = 0; size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; size_t num_bytes_read_from_buffer {0}; - // keep reading + // keep reading until enough data is read or an eof is seen while (true) { auto error_code = BufferReader::try_read(buf + num_bytes_read, num_bytes_to_read_from_buffer, @@ -113,7 +134,7 @@ ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num 
break; } // refill the buffer if more bytes are to be read - error_code = refill_reader_buffer(); + error_code = refill_reader_buffer(m_reader_buffer_size); if (ErrorCode_EndOfFile == error_code) { if (num_bytes_read == 0) { return ErrorCode_EndOfFile; @@ -153,43 +174,38 @@ ErrorCode FileReader::try_seek_from_begin (size_t pos) { m_cursor_pos -= (m_file_pos - pos); m_file_pos = pos; } else { + auto buffer_available_data = m_size - m_cursor_pos; auto seek_distance = pos - m_file_pos; - if (seek_distance <= m_size - m_cursor_pos) { - // we can simply seek in the same buffer; + if (seek_distance <= buffer_available_data) { m_cursor_pos += seek_distance; m_file_pos = pos; + } else if (false == m_checkpoint_enabled) { + auto buffer_aligned_pos = pos & m_reader_buffer_aligned_mask; + auto offset = lseek(m_fd, buffer_aligned_pos, SEEK_SET); + if (offset == -1) { + return ErrorCode_errno; + } + // invalidate buffered_data by setting size to 0 + m_size = 0; + m_file_pos = pos; } else { - if (false == m_checkpoint_enabled) { - // let's assume we want the read to be always page or buffer aligned - auto buffer_aligned_pos = pos & cBufferAlignedMask; - auto offset = lseek(m_fd, buffer_aligned_pos, SEEK_SET); - if (offset == -1) { - return ErrorCode_errno; + size_t num_bytes_to_refill = seek_distance - buffer_available_data; + size_t num_bytes_refilled {0}; + while (true) { + auto error_code = refill_reader_buffer(m_reader_buffer_size, num_bytes_refilled); + if (ErrorCode_EndOfFile == error_code) { + SPDLOG_ERROR("not expecting to seek pass the Entire file"); + throw; } - m_size = 0; - m_file_pos = pos; - // TODO: This line is needed in case - m_cursor_pos = m_file_pos & cCursorMask; - } else { - size_t data_read_remaining = seek_distance; - size_t num_bytes_refilled; - while (true) { - // keep refilling the buffer - auto error_code = refill_reader_buffer(num_bytes_refilled); - if (ErrorCode_EndOfFile == error_code) { - SPDLOG_ERROR("not expecting to seek pass the Entire file"); 
- throw; - } - else if (ErrorCode_Success != error_code) { - return error_code; - } - if (data_read_remaining <= num_bytes_refilled) { - m_file_pos = pos; - m_cursor_pos += seek_distance; - break; - } - data_read_remaining -= cReaderBufferSize; + else if (ErrorCode_Success != error_code) { + return error_code; } + if (num_bytes_to_refill <= m_reader_buffer_size) { + m_file_pos = pos; + m_cursor_pos += seek_distance; + break; + } + num_bytes_to_refill -= num_bytes_refilled; } } } @@ -210,38 +226,43 @@ ErrorCode FileReader::try_open (const string& path) { } m_path = path; m_file_pos = 0; - reset_buffer(m_read_buffer, 0); + reset_buffer(m_read_buffer.get(), 0); return ErrorCode_Success; } -// TODO: optimize this a bit? ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, string& str) { - assert(-1 != m_fd); + if (-1 == m_fd) { + return ErrorCode_NotInit; + } if (false == append) { str.clear(); } bool found_delim {false}; - + size_t delim_pos; while (false == found_delim) { - auto cursor {m_cursor_pos}; - while (cursor < m_size && false == found_delim) { - if (delim == m_buffer[cursor]) { - found_delim = true; - } - cursor++; + auto remaining_data_size = m_size - m_cursor_pos; + // find the pointer pointing to the delimiter + const auto* delim_ptr = reinterpret_cast(memchr(m_buffer + m_cursor_pos, + delim, remaining_data_size)); + if (delim_ptr != nullptr) { + delim_pos = (delim_ptr - m_buffer) + 1; + found_delim = true; + } else { + delim_pos = m_size; } // append to strings + size_t copy_length = delim_pos - m_cursor_pos; std::string_view substr(reinterpret_cast(m_buffer + m_cursor_pos), - cursor - m_cursor_pos); + copy_length); str.append(substr); - // increase file pos - m_file_pos += cursor - m_cursor_pos; - m_cursor_pos = cursor; + // increment file pos to the delimiter or the end of file + m_file_pos += copy_length; + m_cursor_pos = delim_pos; if (false == found_delim) { - if (auto error_code = refill_reader_buffer(); + if 
(auto error_code = refill_reader_buffer(m_reader_buffer_size); ErrorCode_Success != error_code) { return error_code; } @@ -266,7 +287,6 @@ void FileReader::revert_pos() { SPDLOG_ERROR("Checkpoint is not enabled"); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - // this should have revert the pos to the original buffer pos m_file_pos = m_checkpointed_pos; m_cursor_pos = m_checkpointed_buffer_pos; } @@ -281,30 +301,57 @@ void FileReader::mark_pos() { m_checkpoint_enabled = true; } -// let's assume the checkpoint can only be reset if we are already reading -// recent data void FileReader::reset_checkpoint () { // alternatively, we can keep claiming back the memory if (false == m_checkpoint_enabled) { return; } - if (m_size != cReaderBufferSize) { - auto buffer_aligned_copy_pos = m_cursor_pos & cBufferAlignedMask; - auto remaining_data_size = m_size - buffer_aligned_copy_pos; - auto buffer_quantized_size = (1 + ((remaining_data_size - 1) >> cBufferExp)) << cBufferExp; - auto new_buffer = (int8_t*)malloc(sizeof(int8_t) * buffer_quantized_size); - - memcpy(new_buffer, m_buffer + buffer_aligned_copy_pos, remaining_data_size); - free(m_read_buffer); - m_read_buffer = new_buffer; - - m_size = remaining_data_size; - m_buffer = new_buffer; - m_cursor_pos -= buffer_aligned_copy_pos; + if (m_size != m_reader_buffer_size) { + // allocate new buffer for buffered data that hasn't been seek passed + auto copy_pos = m_cursor_pos & m_reader_buffer_aligned_mask; + auto copy_size = m_size - copy_pos; + // Use a quantized size for the new buffer size + auto new_buffer_size = (1 + ((copy_size - 1) >> m_reader_buffer_exp)) + << m_reader_buffer_exp; + + auto new_buffer = make_unique(new_buffer_size); + memcpy(new_buffer.get(), m_buffer + copy_pos, copy_size); + m_read_buffer = std::move(new_buffer); + m_buffer = m_read_buffer.get(); + + m_size = copy_size; + m_cursor_pos -= copy_pos; } m_checkpoint_enabled = false; } +ErrorCode FileReader::set_buffer_size (size_t 
buffer_size) { + if (m_fd != -1) { + SPDLOG_ERROR("Buffer size can not be changed when the file is open"); + return ErrorCode_Failure; + } + if (buffer_size == 0) { + SPDLOG_ERROR("Buffer size can not be set to 0"); + return ErrorCode_BadParam; + } + if (buffer_size % 4096 != 0) { + SPDLOG_ERROR("Buffer size {} is not a multiple of page size", buffer_size); + return ErrorCode_BadParam; + } + // Calculate the logarithm base 2 of the number + double exponent = log(buffer_size) / log(2); + if (ceil(exponent) != floor(exponent)) { + SPDLOG_ERROR("Buffer size {} is not a power of 2", buffer_size); + return ErrorCode_BadParam; + } + + m_reader_buffer_exp = static_cast(exponent); + m_reader_buffer_size = buffer_size; + m_reader_buffer_aligned_mask = ~(m_reader_buffer_size - 1); + m_reader_buffer_cursor_mask = m_reader_buffer_size - 1; + return ErrorCode_Success; +} + void FileReader::close () { if (-1 != m_fd) { // NOTE: We don't check errors for fclose since it seems @@ -313,8 +360,8 @@ void FileReader::close () { m_fd = -1; if (m_checkpoint_enabled) { - // TODO: add a debug log message - m_read_buffer = (int8_t*)realloc(m_read_buffer, cReaderBufferSize); + SPDLOG_DEBUG("close file without resetting checkpoint"); + m_read_buffer = make_unique(m_reader_buffer_size); m_checkpoint_enabled = false; } } diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 01825cc7a..2b8844c04 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -31,10 +31,7 @@ class FileReader : public BufferReader { }; // Constructors - FileReader() : m_file_pos(0), m_fd(-1), m_checkpoint_enabled(false) - { - m_read_buffer = reinterpret_cast(malloc(sizeof(int8_t) * cReaderBufferSize)); - } + FileReader(); ~FileReader(); // Methods implementing the ReaderInterface /** @@ -44,7 +41,7 @@ class FileReader : public BufferReader { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - ErrorCode try_get_pos (size_t& 
pos) override; + [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; /** * Tries to seek from the beginning of the file to the given position * @param pos @@ -52,7 +49,7 @@ class FileReader : public BufferReader { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - ErrorCode try_seek_from_begin (size_t pos) override; + [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; /** * Tries to read up to a given number of bytes from the file @@ -65,7 +62,8 @@ class FileReader : public BufferReader { * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Success on success */ - ErrorCode try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; + [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, + size_t& num_bytes_read) override; /** * Tries to read a string from the file until it reaches the specified delimiter @@ -77,8 +75,8 @@ class FileReader : public BufferReader { * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_errno otherwise */ - ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, - bool append, std::string& str) override; + [[nodiscard]] ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, + bool append, std::string& str) override; // Methods [[nodiscard]] bool is_open () const { return -1 != m_fd; } @@ -90,7 +88,7 @@ class FileReader : public BufferReader { * @return ErrorCode_FileNotFound if the file was not found * @return ErrorCode_errno otherwise */ - ErrorCode try_open (const std::string& path); + [[nodiscard]] ErrorCode try_open (const std::string& path); /** * Opens a file * @param path @@ -115,10 +113,11 @@ class FileReader : public BufferReader { void mark_pos(); void revert_pos(); void reset_checkpoint (); + [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); private: - ErrorCode refill_reader_buffer(); - ErrorCode refill_reader_buffer(size_t& num_bytes_read); + [[nodiscard]] ErrorCode refill_reader_buffer(size_t 
refill_size); + [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); // Types size_t m_file_pos; @@ -126,13 +125,11 @@ class FileReader : public BufferReader { std::string m_path; // Buffer specific data - // TODO: either turn this into a unique ptr, or at least use new & delete - int8_t* m_read_buffer; - int8_t* m_buffer_begin_pos; - static constexpr size_t cBufferExp = 16; - static constexpr size_t cReaderBufferSize = 1 << cBufferExp; - static constexpr size_t cBufferAlignedMask = ~(cReaderBufferSize - 1); - static constexpr size_t cCursorMask = cReaderBufferSize - 1; + std::unique_ptr m_read_buffer; + size_t m_reader_buffer_exp; + size_t m_reader_buffer_size; + size_t m_reader_buffer_aligned_mask; + size_t m_reader_buffer_cursor_mask; // checkpoint specific data bool m_checkpoint_enabled; size_t m_checkpointed_pos; From dc0d91e5dbfffbe17d0786392359c29f9898fdb3 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 14 May 2023 17:56:06 -0400 Subject: [PATCH 022/121] rename new file reader to something else --- components/core/CMakeLists.txt | 7 +- components/core/src/BufferedFileReader.cpp | 380 ++++++++++++++++++ components/core/src/BufferedFileReader.hpp | 140 +++++++ components/core/src/FileReader.cpp | 352 +++------------- components/core/src/FileReader.hpp | 67 +-- components/core/src/dictionary_utils.cpp | 4 - ...Reader.cpp => test-BufferedFileReader.cpp} | 16 +- 7 files changed, 608 insertions(+), 358 deletions(-) create mode 100644 components/core/src/BufferedFileReader.cpp create mode 100644 components/core/src/BufferedFileReader.hpp rename components/core/tests/{test-FileReader.cpp => test-BufferedFileReader.cpp} (95%) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 4f956a018..51c83a75b 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -630,6 +630,10 @@ target_compile_features(clo ) 
set(SOURCE_FILES_unitTest + src/BufferReader.cpp + src/BufferReader.hpp + src/BufferedFileReader.cpp + src/BufferedFileReader.hpp src/clp/CommandLineArguments.cpp src/clp/CommandLineArguments.hpp src/clp/compression.cpp @@ -817,6 +821,7 @@ set(SOURCE_FILES_unitTest submodules/sqlite3/sqlite3.c submodules/sqlite3/sqlite3.h submodules/sqlite3/sqlite3ext.h + tests/test-BufferedFileReader.cpp tests/test-EncodedVariableInterpreter.cpp tests/test-encoding_methods.cpp tests/test-Grep.cpp @@ -830,8 +835,6 @@ set(SOURCE_FILES_unitTest tests/test-string_utils.cpp tests/test-TimestampPattern.cpp tests/test-Utils.cpp - src/BufferReader.cpp - src/BufferReader.hpp ) add_executable(unitTest ${SOURCE_FILES_unitTest}) target_link_libraries(unitTest diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp new file mode 100644 index 000000000..bf0ce74af --- /dev/null +++ b/components/core/src/BufferedFileReader.cpp @@ -0,0 +1,380 @@ +#include "BufferedFileReader.hpp" + +// Boost libraries +#include + +// C standard libraries +// C libraries +#include +#include + +// C++ standard libraries +#include + +// Project headers +#include + +using std::make_unique; +using std::move; +using std::string; + +BufferedFileReader::BufferedFileReader () { + m_file_pos = 0; + m_fd = -1; + m_checkpoint_enabled = 0; + constexpr size_t DefaultBufferSize = 65536; + if (ErrorCode_Success != set_buffer_size(DefaultBufferSize)) { + throw "Failed to init reader buffer size\n"; + } + m_read_buffer = make_unique(m_reader_buffer_size); +} + +BufferedFileReader::~BufferedFileReader () { + close(); +} + +ErrorCode BufferedFileReader::try_get_pos (size_t& pos) { + if (-1 == m_fd) { + return ErrorCode_NotInit; + } + pos = m_file_pos; + return ErrorCode_Success; +} + +static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, + size_t& num_bytes_read) { + num_bytes_read = 0; + // keep reading from the fd until seeing a 0, which means eof + 
while (true) { + auto bytes_read = ::read(fd, buffer + num_bytes_read, num_bytes_to_read); + if (bytes_read == -1) { + return ErrorCode_errno; + } + if (bytes_read == 0) { + break; + } + num_bytes_read += bytes_read; + num_bytes_to_read -= bytes_read; + if (num_bytes_to_read == 0) { + return ErrorCode_Success; + } + } + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; +} + +ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size) { + size_t num_bytes_refilled; + return refill_reader_buffer (refill_size, num_bytes_refilled); +} + +ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, size_t& num_bytes_refilled) { + num_bytes_refilled = 0; + if (false == m_checkpoint_enabled) { + // recover from a previous reset + if (m_size > refill_size) { + m_read_buffer = make_unique(refill_size); + m_buffer = m_read_buffer.get(); + } + auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get(), + refill_size, num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; + } + m_size = num_bytes_refilled; + m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; + } else { + if (m_size == 0) { + // if buffer is uninitialized, reset the cursor_pos + // in case it's after a seek + m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; + } + + // Messy way of copying data from old buffer to new buffer + auto new_buffer = make_unique(m_size + refill_size); + memcpy(new_buffer.get(), m_read_buffer.get(), m_size); + m_read_buffer = std::move(new_buffer); + m_buffer = m_read_buffer.get(); + + auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get() + m_size, refill_size, + num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; + } + m_size += num_bytes_refilled; + } + return ErrorCode_Success; +} + +ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + if (-1 == m_fd) { + return ErrorCode_NotInit; + 
} + if (nullptr == buf) { + return ErrorCode_BadParam; + } + + num_bytes_read = 0; + size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; + size_t num_bytes_read_from_buffer {0}; + // keep reading until enough data is read or an eof is seen + while (true) { + auto error_code = BufferReader::try_read(buf + num_bytes_read, + num_bytes_to_read_from_buffer, + num_bytes_read_from_buffer); + if (ErrorCode_Success == error_code || + ErrorCode_EndOfFile == error_code || + ErrorCode_NotInit == error_code) { + m_file_pos += num_bytes_read_from_buffer; + num_bytes_read += num_bytes_read_from_buffer; + num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; + if (num_bytes_to_read_from_buffer == 0) { + break; + } + // refill the buffer if more bytes are to be read + error_code = refill_reader_buffer(m_reader_buffer_size); + if (ErrorCode_EndOfFile == error_code) { + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } else { + break; + } + } + else if (ErrorCode_Success != error_code) { + return error_code; + } + } else { + return error_code; + } + } + return ErrorCode_Success; +} + +// Maybe everytime, I should always read a page? 
+ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { + if (m_fd == -1) { + return ErrorCode_NotInit; + } + // early return path + if (pos == m_file_pos) { + return ErrorCode_Success; + } + + if (pos <= m_file_pos) { + if (false == m_checkpoint_enabled) { + SPDLOG_ERROR("Seek back not allowed when checkpoint is not enabled"); + return ErrorCode_Failure; + } + if (pos < m_checkpointed_pos) { + SPDLOG_ERROR("Seek back before the checkpoint is not supported"); + return ErrorCode_Failure; + } + m_cursor_pos -= (m_file_pos - pos); + m_file_pos = pos; + } else { + auto buffer_available_data = m_size - m_cursor_pos; + auto seek_distance = pos - m_file_pos; + if (seek_distance <= buffer_available_data) { + m_cursor_pos += seek_distance; + m_file_pos = pos; + } else if (false == m_checkpoint_enabled) { + auto buffer_aligned_pos = pos & m_reader_buffer_aligned_mask; + auto offset = lseek(m_fd, buffer_aligned_pos, SEEK_SET); + if (offset == -1) { + return ErrorCode_errno; + } + // invalidate buffered_data by setting size to 0 + m_size = 0; + m_file_pos = pos; + } else { + size_t num_bytes_to_refill = seek_distance - buffer_available_data; + size_t num_bytes_refilled {0}; + while (true) { + auto error_code = refill_reader_buffer(m_reader_buffer_size, num_bytes_refilled); + if (ErrorCode_EndOfFile == error_code) { + SPDLOG_ERROR("not expecting to seek pass the Entire file"); + throw; + } + else if (ErrorCode_Success != error_code) { + return error_code; + } + if (num_bytes_to_refill <= m_reader_buffer_size) { + m_file_pos = pos; + m_cursor_pos += seek_distance; + break; + } + num_bytes_to_refill -= num_bytes_refilled; + } + } + } + return ErrorCode_Success; +} + + +ErrorCode BufferedFileReader::try_open (const string& path) { + // Cleanup in case caller forgot to call close before calling this function + close(); + + m_fd = ::open(path.c_str(), O_RDONLY); + if (-1 == m_fd) { + if (ENOENT == errno) { + return ErrorCode_FileNotFound; + } + return ErrorCode_errno; + } 
+ m_path = path; + m_file_pos = 0; + reset_buffer(m_read_buffer.get(), 0); + return ErrorCode_Success; +} + +ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delimiter, + bool append, string& str) { + if (-1 == m_fd) { + return ErrorCode_NotInit; + } + + if (false == append) { + str.clear(); + } + + bool found_delim {false}; + size_t delim_pos; + while (false == found_delim) { + auto remaining_data_size = m_size - m_cursor_pos; + // find the pointer pointing to the delimiter + const auto* delim_ptr = reinterpret_cast(memchr(m_buffer + m_cursor_pos, + delim, remaining_data_size)); + if (delim_ptr != nullptr) { + delim_pos = (delim_ptr - m_buffer) + 1; + found_delim = true; + } else { + delim_pos = m_size; + } + // append to strings + size_t copy_length = delim_pos - m_cursor_pos; + std::string_view substr(reinterpret_cast(m_buffer + m_cursor_pos), + copy_length); + str.append(substr); + // increment file pos to the delimiter or the end of file + m_file_pos += copy_length; + m_cursor_pos = delim_pos; + if (false == found_delim) { + if (auto error_code = refill_reader_buffer(m_reader_buffer_size); + ErrorCode_Success != error_code) { + return error_code; + } + } + } + return ErrorCode_Success; +} + +void BufferedFileReader::open (const string& path) { + ErrorCode error_code = try_open(path); + if (ErrorCode_Success != error_code) { + if (ErrorCode_FileNotFound == error_code) { + throw "File not found: " + boost::filesystem::weakly_canonical(path).string() + "\n"; + } else { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + } +} + +void BufferedFileReader::revert_pos() { + if (false == m_checkpoint_enabled) { + SPDLOG_ERROR("Checkpoint is not enabled"); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + m_file_pos = m_checkpointed_pos; + m_cursor_pos = m_checkpointed_buffer_pos; +} + +void BufferedFileReader::mark_pos() { + if (true == m_checkpoint_enabled) { + SPDLOG_ERROR("I haven't carefully think 
about whether we should allow this or not"); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + m_checkpointed_pos = m_file_pos; + m_checkpointed_buffer_pos = m_cursor_pos; + m_checkpoint_enabled = true; +} + +void BufferedFileReader::reset_checkpoint () { + // alternatively, we can keep claiming back the memory + if (false == m_checkpoint_enabled) { + return; + } + if (m_size != m_reader_buffer_size) { + // allocate new buffer for buffered data that hasn't been seek passed + auto copy_pos = m_cursor_pos & m_reader_buffer_aligned_mask; + auto copy_size = m_size - copy_pos; + // Use a quantized size for the new buffer size + auto new_buffer_size = (1 + ((copy_size - 1) >> m_reader_buffer_exp)) + << m_reader_buffer_exp; + + auto new_buffer = make_unique(new_buffer_size); + memcpy(new_buffer.get(), m_buffer + copy_pos, copy_size); + m_read_buffer = std::move(new_buffer); + m_buffer = m_read_buffer.get(); + + m_size = copy_size; + m_cursor_pos -= copy_pos; + } + m_checkpoint_enabled = false; +} + +ErrorCode BufferedFileReader::set_buffer_size (size_t buffer_size) { + if (m_fd != -1) { + SPDLOG_ERROR("Buffer size can not be changed when the file is open"); + return ErrorCode_Failure; + } + if (buffer_size == 0) { + SPDLOG_ERROR("Buffer size can not be set to 0"); + return ErrorCode_BadParam; + } + if (buffer_size % 4096 != 0) { + SPDLOG_ERROR("Buffer size {} is not a multiple of page size", buffer_size); + return ErrorCode_BadParam; + } + // Calculate the logarithm base 2 of the number + double exponent = log(buffer_size) / log(2); + if (ceil(exponent) != floor(exponent)) { + SPDLOG_ERROR("Buffer size {} is not a power of 2", buffer_size); + return ErrorCode_BadParam; + } + + m_reader_buffer_exp = static_cast(exponent); + m_reader_buffer_size = buffer_size; + m_reader_buffer_aligned_mask = ~(m_reader_buffer_size - 1); + m_reader_buffer_cursor_mask = m_reader_buffer_size - 1; + return ErrorCode_Success; +} + +void BufferedFileReader::close () { + 
if (-1 != m_fd) { + // NOTE: We don't check errors for fclose since it seems + // the only reason it could fail is if it was interrupted by a signal + ::close(m_fd); + m_fd = -1; + + if (m_checkpoint_enabled) { + SPDLOG_DEBUG("close file without resetting checkpoint"); + m_read_buffer = make_unique(m_reader_buffer_size); + m_checkpoint_enabled = false; + } + } +} + +ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { + if (-1 == m_fd) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + auto return_value = fstat(m_fd, &stat_buffer); + if (0 != return_value) { + return ErrorCode_errno; + } + return ErrorCode_Success; +} diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp new file mode 100644 index 000000000..64f488c49 --- /dev/null +++ b/components/core/src/BufferedFileReader.hpp @@ -0,0 +1,140 @@ +#ifndef BufferedFileReader_HPP +#define BufferedFileReader_HPP + +// C standard libraries + +// C++ libraries +#include +#include +#include + +// Project headers +#include "Defs.h" +#include "ErrorCode.hpp" +#include "BufferReader.hpp" +#include "TraceableException.hpp" + + +class BufferedFileReader : public BufferReader { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : + TraceableException (error_code, filename, line_number) {} + + // Methods + [[nodiscard]] const char* what () const noexcept override { + return "BufferedFileReader operation failed"; + } + }; + + // Constructors + BufferedFileReader(); + ~BufferedFileReader(); + // Methods implementing the ReaderInterface + /** + * Tries to get the current position of the read head in the file + * @param pos Position of the read head in the file + * @return ErrorCode_NotInit if the file is not open + * @return ErrorCode_errno on error + * @return ErrorCode_Success on success + */ + 
[[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; + /** + * Tries to seek from the beginning of the file to the given position + * @param pos + * @return ErrorCode_NotInit if the file is not open + * @return ErrorCode_errno on error + * @return ErrorCode_Success on success + */ + [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; + + /** + * Tries to read up to a given number of bytes from the file + * @param buf + * @param num_bytes_to_read The number of bytes to try and read + * @param num_bytes_read The actual number of bytes read + * @return ErrorCode_NotInit if the file is not open + * @return ErrorCode_BadParam if buf is invalid + * @return ErrorCode_errno on error + * @return ErrorCode_EndOfFile on EOF + * @return ErrorCode_Success on success + */ + [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, + size_t& num_bytes_read) override; + + /** + * Tries to read a string from the file until it reaches the specified delimiter + * @param delim The delimiter to stop at + * @param keep_delimiter Whether to include the delimiter in the output string or not + * @param append Whether to append to the given string or replace its contents + * @param str The string read + * @return ErrorCode_Success on success + * @return ErrorCode_EndOfFile on EOF + * @return ErrorCode_errno otherwise + */ + [[nodiscard]] ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, + bool append, std::string& str) override; + + // Methods + [[nodiscard]] bool is_open () const { return -1 != m_fd; } + + /** + * Tries to open a file + * @param path + * @return ErrorCode_Success on success + * @return ErrorCode_FileNotFound if the file was not found + * @return ErrorCode_errno otherwise + */ + [[nodiscard]] ErrorCode try_open (const std::string& path); + /** + * Opens a file + * @param path + * @throw BufferedFileReader::OperationFailed on failure + */ + void open (const std::string& path); + /** + * Closes the file if it's open + */ + 
void close (); + + [[nodiscard]] const std::string& get_path () const { return m_path; } + + /** + * Tries to stat the current file + * @param stat_buffer + * @return ErrorCode_errno on error + * @return ErrorCode_Success on success + */ + [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer) const; + + void mark_pos(); + void revert_pos(); + void reset_checkpoint (); + [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); + +private: + [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); + [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); + + // Types + size_t m_file_pos; + int m_fd; + std::string m_path; + + // Buffer specific data + std::unique_ptr m_read_buffer; + size_t m_reader_buffer_exp; + size_t m_reader_buffer_size; + size_t m_reader_buffer_aligned_mask; + size_t m_reader_buffer_cursor_mask; + // checkpoint specific data + bool m_checkpoint_enabled; + size_t m_checkpointed_pos; + size_t m_checkpointed_buffer_pos; +}; + + +#endif // BufferedFileReader diff --git a/components/core/src/FileReader.cpp b/components/core/src/FileReader.cpp index 9ee9284a3..f1b740d8b 100644 --- a/components/core/src/FileReader.cpp +++ b/components/core/src/FileReader.cpp @@ -4,270 +4,82 @@ #include // C standard libraries -// C libraries +#include #include -#include +#include -// C++ standard libraries +// C++ libraries +#include #include -// Project headers -#include - -using std::make_unique; -using std::move; using std::string; -FileReader::FileReader () { - m_file_pos = 0; - m_fd = -1; - m_checkpoint_enabled = 0; - constexpr size_t DefaultBufferSize = 65536; - if (ErrorCode_Success != set_buffer_size(DefaultBufferSize)) { - throw "Failed to init reader buffer size\n"; - } - m_read_buffer = make_unique(m_reader_buffer_size); -} - FileReader::~FileReader () { close(); + free(m_getdelim_buf); } -ErrorCode FileReader::try_get_pos (size_t& pos) { - if (-1 == m_fd) { +ErrorCode FileReader::try_read (char* buf, 
size_t num_bytes_to_read, size_t& num_bytes_read) { + if (nullptr == m_file) { return ErrorCode_NotInit; } - pos = m_file_pos; - return ErrorCode_Success; -} + if (nullptr == buf) { + return ErrorCode_BadParam; + } -static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, - size_t& num_bytes_read) { - num_bytes_read = 0; - // keep reading from the fd until seeing a 0, which means eof - while (true) { - auto bytes_read = ::read(fd, buffer + num_bytes_read, num_bytes_to_read); - if (bytes_read == -1) { + num_bytes_read = fread(buf, sizeof(*buf), num_bytes_to_read, m_file); + if (num_bytes_read < num_bytes_to_read) { + if (ferror(m_file)) { return ErrorCode_errno; - } - if (bytes_read == 0) { - break; - } - num_bytes_read += bytes_read; - num_bytes_to_read -= bytes_read; - if (num_bytes_to_read == 0) { - return ErrorCode_Success; + } else if (feof(m_file)) { + if (0 == num_bytes_read) { + return ErrorCode_EndOfFile; + } } } - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } - return ErrorCode_Success; -} - -ErrorCode FileReader::refill_reader_buffer (size_t refill_size) { - size_t num_bytes_refilled; - return refill_reader_buffer (refill_size, num_bytes_refilled); -} - -ErrorCode FileReader::refill_reader_buffer (size_t refill_size, size_t& num_bytes_refilled) { - num_bytes_refilled = 0; - if (false == m_checkpoint_enabled) { - // recover from a previous reset - if (m_size > refill_size) { - m_read_buffer = make_unique(refill_size); - m_buffer = m_read_buffer.get(); - } - auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get(), - refill_size, num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; - } - m_size = num_bytes_refilled; - m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; - } else { - if (m_size == 0) { - // if buffer is uninitialized, reset the cursor_pos - // in case it's after a seek - m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; - } - - // Messy way of 
copying data from old buffer to new buffer - auto new_buffer = make_unique(m_size + refill_size); - memcpy(new_buffer.get(), m_read_buffer.get(), m_size); - m_read_buffer = std::move(new_buffer); - m_buffer = m_read_buffer.get(); - auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get() + m_size, refill_size, - num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; - } - m_size += num_bytes_refilled; - } return ErrorCode_Success; } -ErrorCode FileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - if (-1 == m_fd) { +ErrorCode FileReader::try_seek_from_begin (size_t pos) { + if (nullptr == m_file) { return ErrorCode_NotInit; } - if (nullptr == buf) { - return ErrorCode_BadParam; - } - num_bytes_read = 0; - size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; - size_t num_bytes_read_from_buffer {0}; - // keep reading until enough data is read or an eof is seen - while (true) { - auto error_code = BufferReader::try_read(buf + num_bytes_read, - num_bytes_to_read_from_buffer, - num_bytes_read_from_buffer); - if (ErrorCode_Success == error_code || - ErrorCode_EndOfFile == error_code || - ErrorCode_NotInit == error_code) { - m_file_pos += num_bytes_read_from_buffer; - num_bytes_read += num_bytes_read_from_buffer; - num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; - if (num_bytes_to_read_from_buffer == 0) { - break; - } - // refill the buffer if more bytes are to be read - error_code = refill_reader_buffer(m_reader_buffer_size); - if (ErrorCode_EndOfFile == error_code) { - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } else { - break; - } - } - else if (ErrorCode_Success != error_code) { - return error_code; - } - } else { - return error_code; - } + int retval = fseeko(m_file, pos, SEEK_SET); + if (0 != retval) { + return ErrorCode_errno; } + return ErrorCode_Success; } -// Maybe everytime, I should always read a page? 
-ErrorCode FileReader::try_seek_from_begin (size_t pos) { - if (m_fd == -1) { +ErrorCode FileReader::try_get_pos (size_t& pos) { + if (nullptr == m_file) { return ErrorCode_NotInit; } - // early return path - if (pos == m_file_pos) { - return ErrorCode_Success; - } - if (pos <= m_file_pos) { - if (false == m_checkpoint_enabled) { - SPDLOG_ERROR("Seek back not allowed when checkpoint is not enabled"); - return ErrorCode_Failure; - } - if (pos < m_checkpointed_pos) { - SPDLOG_ERROR("Seek back before the checkpoint is not supported"); - return ErrorCode_Failure; - } - m_cursor_pos -= (m_file_pos - pos); - m_file_pos = pos; - } else { - auto buffer_available_data = m_size - m_cursor_pos; - auto seek_distance = pos - m_file_pos; - if (seek_distance <= buffer_available_data) { - m_cursor_pos += seek_distance; - m_file_pos = pos; - } else if (false == m_checkpoint_enabled) { - auto buffer_aligned_pos = pos & m_reader_buffer_aligned_mask; - auto offset = lseek(m_fd, buffer_aligned_pos, SEEK_SET); - if (offset == -1) { - return ErrorCode_errno; - } - // invalidate buffered_data by setting size to 0 - m_size = 0; - m_file_pos = pos; - } else { - size_t num_bytes_to_refill = seek_distance - buffer_available_data; - size_t num_bytes_refilled {0}; - while (true) { - auto error_code = refill_reader_buffer(m_reader_buffer_size, num_bytes_refilled); - if (ErrorCode_EndOfFile == error_code) { - SPDLOG_ERROR("not expecting to seek pass the Entire file"); - throw; - } - else if (ErrorCode_Success != error_code) { - return error_code; - } - if (num_bytes_to_refill <= m_reader_buffer_size) { - m_file_pos = pos; - m_cursor_pos += seek_distance; - break; - } - num_bytes_to_refill -= num_bytes_refilled; - } - } + pos = ftello(m_file); + if ((off_t)-1 == pos) { + return ErrorCode_errno; } + return ErrorCode_Success; } - ErrorCode FileReader::try_open (const string& path) { // Cleanup in case caller forgot to call close before calling this function close(); - m_fd = ::open(path.c_str(), 
O_RDONLY); - if (-1 == m_fd) { + m_file = fopen(path.c_str(), "rb"); + if (nullptr == m_file) { if (ENOENT == errno) { return ErrorCode_FileNotFound; } return ErrorCode_errno; } m_path = path; - m_file_pos = 0; - reset_buffer(m_read_buffer.get(), 0); - return ErrorCode_Success; -} - -ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, - bool append, string& str) { - if (-1 == m_fd) { - return ErrorCode_NotInit; - } - if (false == append) { - str.clear(); - } - - bool found_delim {false}; - size_t delim_pos; - while (false == found_delim) { - auto remaining_data_size = m_size - m_cursor_pos; - // find the pointer pointing to the delimiter - const auto* delim_ptr = reinterpret_cast(memchr(m_buffer + m_cursor_pos, - delim, remaining_data_size)); - if (delim_ptr != nullptr) { - delim_pos = (delim_ptr - m_buffer) + 1; - found_delim = true; - } else { - delim_pos = m_size; - } - // append to strings - size_t copy_length = delim_pos - m_cursor_pos; - std::string_view substr(reinterpret_cast(m_buffer + m_cursor_pos), - copy_length); - str.append(substr); - // increment file pos to the delimiter or the end of file - m_file_pos += copy_length; - m_cursor_pos = delim_pos; - if (false == found_delim) { - if (auto error_code = refill_reader_buffer(m_reader_buffer_size); - ErrorCode_Success != error_code) { - return error_code; - } - } - } return ErrorCode_Success; } @@ -275,104 +87,50 @@ void FileReader::open (const string& path) { ErrorCode error_code = try_open(path); if (ErrorCode_Success != error_code) { if (ErrorCode_FileNotFound == error_code) { - throw "File not found: " + boost::filesystem::weakly_canonical(path).string() + "\n"; + throw "File not found: " + boost::filesystem::weakly_canonical(path).string() + "\n"; } else { throw OperationFailed(error_code, __FILENAME__, __LINE__); } } } -void FileReader::revert_pos() { - if (false == m_checkpoint_enabled) { - SPDLOG_ERROR("Checkpoint is not enabled"); - throw 
OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - m_file_pos = m_checkpointed_pos; - m_cursor_pos = m_checkpointed_buffer_pos; -} - -void FileReader::mark_pos() { - if (true == m_checkpoint_enabled) { - SPDLOG_ERROR("I haven't carefully think about whether we should allow this or not"); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); +void FileReader::close () { + if (m_file != nullptr) { + // NOTE: We don't check errors for fclose since it seems the only reason it could fail is if it was interrupted + // by a signal + fclose(m_file); + m_file = nullptr; } - m_checkpointed_pos = m_file_pos; - m_checkpointed_buffer_pos = m_cursor_pos; - m_checkpoint_enabled = true; } -void FileReader::reset_checkpoint () { - // alternatively, we can keep claiming back the memory - if (false == m_checkpoint_enabled) { - return; - } - if (m_size != m_reader_buffer_size) { - // allocate new buffer for buffered data that hasn't been seek passed - auto copy_pos = m_cursor_pos & m_reader_buffer_aligned_mask; - auto copy_size = m_size - copy_pos; - // Use a quantized size for the new buffer size - auto new_buffer_size = (1 + ((copy_size - 1) >> m_reader_buffer_exp)) - << m_reader_buffer_exp; - - auto new_buffer = make_unique(new_buffer_size); - memcpy(new_buffer.get(), m_buffer + copy_pos, copy_size); - m_read_buffer = std::move(new_buffer); - m_buffer = m_read_buffer.get(); +ErrorCode FileReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, string& str) { + assert(nullptr != m_file); - m_size = copy_size; - m_cursor_pos -= copy_pos; - } - m_checkpoint_enabled = false; -} - -ErrorCode FileReader::set_buffer_size (size_t buffer_size) { - if (m_fd != -1) { - SPDLOG_ERROR("Buffer size can not be changed when the file is open"); - return ErrorCode_Failure; - } - if (buffer_size == 0) { - SPDLOG_ERROR("Buffer size can not be set to 0"); - return ErrorCode_BadParam; + if (false == append) { + str.clear(); } - if (buffer_size % 4096 != 
0) { - SPDLOG_ERROR("Buffer size {} is not a multiple of page size", buffer_size); - return ErrorCode_BadParam; + ssize_t num_bytes_read = getdelim(&m_getdelim_buf, &m_getdelim_buf_len, delim, m_file); + if (num_bytes_read < 1) { + if (ferror(m_file)) { + return ErrorCode_errno; + } else if (feof(m_file)) { + return ErrorCode_EndOfFile; + } } - // Calculate the logarithm base 2 of the number - double exponent = log(buffer_size) / log(2); - if (ceil(exponent) != floor(exponent)) { - SPDLOG_ERROR("Buffer size {} is not a power of 2", buffer_size); - return ErrorCode_BadParam; + if (false == keep_delimiter && delim == m_getdelim_buf[num_bytes_read - 1]) { + --num_bytes_read; } + str.append(m_getdelim_buf, num_bytes_read); - m_reader_buffer_exp = static_cast(exponent); - m_reader_buffer_size = buffer_size; - m_reader_buffer_aligned_mask = ~(m_reader_buffer_size - 1); - m_reader_buffer_cursor_mask = m_reader_buffer_size - 1; return ErrorCode_Success; } -void FileReader::close () { - if (-1 != m_fd) { - // NOTE: We don't check errors for fclose since it seems - // the only reason it could fail is if it was interrupted by a signal - ::close(m_fd); - m_fd = -1; - - if (m_checkpoint_enabled) { - SPDLOG_DEBUG("close file without resetting checkpoint"); - m_read_buffer = make_unique(m_reader_buffer_size); - m_checkpoint_enabled = false; - } - } -} - -ErrorCode FileReader::try_fstat (struct stat& stat_buffer) const { - if (-1 == m_fd) { +ErrorCode FileReader::try_fstat (struct stat& stat_buffer) { + if (nullptr == m_file) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - auto return_value = fstat(m_fd, &stat_buffer); + auto return_value = fstat(fileno(m_file), &stat_buffer); if (0 != return_value) { return ErrorCode_errno; } diff --git a/components/core/src/FileReader.hpp b/components/core/src/FileReader.hpp index 2b8844c04..6347a0bea 100644 --- a/components/core/src/FileReader.hpp +++ b/components/core/src/FileReader.hpp @@ -1,38 +1,33 @@ -#ifndef 
FileReaderSys_HPP -#define FileReaderSys_HPP - -// C standard libraries +#ifndef FILEREADER_HPP +#define FILEREADER_HPP // C++ libraries #include -#include #include // Project headers #include "Defs.h" #include "ErrorCode.hpp" -#include "BufferReader.hpp" +#include "ReaderInterface.hpp" #include "TraceableException.hpp" - -class FileReader : public BufferReader { +class FileReader : public ReaderInterface { public: // Types class OperationFailed : public TraceableException { public: // Constructors - OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : - TraceableException (error_code, filename, line_number) {} + OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : TraceableException (error_code, filename, line_number) {} // Methods - [[nodiscard]] const char* what () const noexcept override { + const char* what () const noexcept override { return "FileReader operation failed"; } }; - // Constructors - FileReader(); - ~FileReader(); + FileReader () : m_file(nullptr), m_getdelim_buf_len(0), m_getdelim_buf(nullptr) {} + ~FileReader (); + // Methods implementing the ReaderInterface /** * Tries to get the current position of the read head in the file @@ -41,7 +36,7 @@ class FileReader : public BufferReader { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; + ErrorCode try_get_pos (size_t& pos) override; /** * Tries to seek from the beginning of the file to the given position * @param pos @@ -49,7 +44,7 @@ class FileReader : public BufferReader { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; + ErrorCode try_seek_from_begin (size_t pos) override; /** * Tries to read up to a given number of bytes from the file @@ -62,8 +57,7 @@ class FileReader : public BufferReader { * @return ErrorCode_EndOfFile on EOF * @return 
ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, - size_t& num_bytes_read) override; + ErrorCode try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; /** * Tries to read a string from the file until it reaches the specified delimiter @@ -75,11 +69,10 @@ class FileReader : public BufferReader { * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_errno otherwise */ - [[nodiscard]] ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, - bool append, std::string& str) override; + ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; // Methods - [[nodiscard]] bool is_open () const { return -1 != m_fd; } + bool is_open () const { return m_file != nullptr; } /** * Tries to open a file @@ -88,7 +81,7 @@ class FileReader : public BufferReader { * @return ErrorCode_FileNotFound if the file was not found * @return ErrorCode_errno otherwise */ - [[nodiscard]] ErrorCode try_open (const std::string& path); + ErrorCode try_open (const std::string& path); /** * Opens a file * @param path @@ -108,33 +101,13 @@ class FileReader : public BufferReader { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer) const; - - void mark_pos(); - void revert_pos(); - void reset_checkpoint (); - [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); + ErrorCode try_fstat (struct stat& stat_buffer); private: - [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); - [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); - - // Types - size_t m_file_pos; - int m_fd; + FILE* m_file; + size_t m_getdelim_buf_len; + char* m_getdelim_buf; std::string m_path; - - // Buffer specific data - std::unique_ptr m_read_buffer; - size_t m_reader_buffer_exp; - size_t m_reader_buffer_size; - size_t 
m_reader_buffer_aligned_mask; - size_t m_reader_buffer_cursor_mask; - // checkpoint specific data - bool m_checkpoint_enabled; - size_t m_checkpointed_pos; - size_t m_checkpointed_buffer_pos; }; - -#endif // FileReaderSys_HPP +#endif // FILEREADER_HPP diff --git a/components/core/src/dictionary_utils.cpp b/components/core/src/dictionary_utils.cpp index 10a8a3542..7fd57df90 100644 --- a/components/core/src/dictionary_utils.cpp +++ b/components/core/src/dictionary_utils.cpp @@ -5,14 +5,12 @@ void open_dictionary_for_reading (const std::string& dictionary_path, const std: FileReader& segment_index_file_reader, streaming_compression::Decompressor& segment_index_decompressor) { dictionary_file_reader.open(dictionary_path); - dictionary_file_reader.mark_pos(); // Skip header dictionary_file_reader.seek_from_begin(sizeof(uint64_t)); // Open decompressor dictionary_decompressor.open(dictionary_file_reader, decompressor_file_read_buffer_capacity); segment_index_file_reader.open(segment_index_path); - segment_index_file_reader.mark_pos(); // Skip header segment_index_file_reader.seek_from_begin(sizeof(uint64_t)); // Open decompressor @@ -25,7 +23,6 @@ uint64_t read_dictionary_header (FileReader& file_reader) { uint64_t num_dictionary_entries; file_reader.read_numeric_value(num_dictionary_entries, false); file_reader.seek_from_begin(dictionary_file_reader_pos); - file_reader.reset_checkpoint(); return num_dictionary_entries; } @@ -36,6 +33,5 @@ uint64_t read_segment_index_header (FileReader& file_reader) { uint64_t num_segments; file_reader.read_numeric_value(num_segments, false); file_reader.seek_from_begin(segment_index_file_reader_pos); - file_reader.reset_checkpoint(); return num_segments; } diff --git a/components/core/tests/test-FileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp similarity index 95% rename from components/core/tests/test-FileReader.cpp rename to components/core/tests/test-BufferedFileReader.cpp index 4a473f5bd..494f00318 100644 --- 
a/components/core/tests/test-FileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -9,10 +9,10 @@ // Project headers #include "../src/FileWriter.hpp" -#include "../src/FileReader.hpp" +#include "../src/BufferedFileReader.hpp" #include "../src/Utils.hpp" -TEST_CASE("Test reading data", "[FileReader]") { +TEST_CASE("Test reading data", "[BufferedFileReader]") { ErrorCode error_code; // Initialize data for testing @@ -23,7 +23,7 @@ TEST_CASE("Test reading data", "[FileReader]") { test_data[i] = (char)('a' + (i % 26)); } - std::string test_file_path {"FileReader.test"}; + std::string test_file_path {"BufferedFileReader.test"}; // write to test file FileWriter file_writer; file_writer.open(test_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); @@ -31,7 +31,7 @@ TEST_CASE("Test reading data", "[FileReader]") { file_writer.close(); SECTION("General read testing") { - FileReader file_reader; + BufferedFileReader file_reader; file_reader.open(test_file_path); size_t num_bytes_read {0}; size_t buffer_offset {0}; @@ -44,7 +44,7 @@ TEST_CASE("Test reading data", "[FileReader]") { buffer_offset += num_bytes_read; // second, read a large chunk of data, so - // fileReader will refill the internal buffer + // BufferedFileReader will refill the internal buffer size_t read_size2 {65538}; REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer + buffer_offset, read_size2, num_bytes_read)); @@ -68,7 +68,7 @@ TEST_CASE("Test reading data", "[FileReader]") { } SECTION("Simple Seek without checkpoint") { - FileReader file_reader; + BufferedFileReader file_reader; file_reader.open(test_file_path); // seek to some random position and do a read @@ -95,7 +95,7 @@ TEST_CASE("Test reading data", "[FileReader]") { } SECTION("Simple seek with checkpoint") { - FileReader file_reader; + BufferedFileReader file_reader; file_reader.open(test_file_path); // first, read some data to proceed the file_pos @@ -151,7 +151,7 @@ TEST_CASE("Test reading data", "[FileReader]") { 
} SECTION("Simple seek with delayed read") { - FileReader file_reader; + BufferedFileReader file_reader; file_reader.open(test_file_path); // first, read seek to some random file_pos From ccad42109cb7845f717f34e4de3099c62b6e132d Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 14 May 2023 20:15:35 -0400 Subject: [PATCH 023/121] More refactoring --- components/core/src/BufferReader.cpp | 12 +- components/core/src/BufferReader.hpp | 14 ++- components/core/src/BufferedFileReader.cpp | 58 +++++---- components/core/src/BufferedFileReader.hpp | 5 +- .../core/tests/test-BufferedFileReader.cpp | 118 +++++++----------- 5 files changed, 97 insertions(+), 110 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 55d8a6a0f..b14c5069e 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -9,7 +9,7 @@ using std::string_view; [[nodiscard]] ErrorCode BufferReader::try_get_pos (size_t& pos) { - if (nullptr == m_buffer || 0 == m_size) { + if (nullptr == m_data || 0 == m_size) { return ErrorCode_NotInit; } pos = m_cursor_pos; @@ -17,7 +17,7 @@ using std::string_view; } [[nodiscard]] ErrorCode BufferReader::try_seek_from_begin (size_t pos) { - if (nullptr == m_buffer || 0 == m_size) { + if (nullptr == m_data || 0 == m_size) { return ErrorCode_NotInit; } if (pos > m_size) { @@ -32,19 +32,19 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n // but we need this strong behavior for the upper class num_bytes_read = 0; - if (nullptr == m_buffer) { + if (nullptr == m_data) { return ErrorCode_NotInit; } if (nullptr == buf) { return ErrorCode_BadParam; } - if (m_cursor_pos >= m_size) { + if (m_size == 0 && m_cursor_pos != 0) { return ErrorCode_EndOfFile; } num_bytes_read = std::min(m_size - m_cursor_pos, num_bytes_to_read); - memcpy(buf, m_buffer + m_cursor_pos, num_bytes_read); + memcpy(buf, m_data + m_cursor_pos, 
num_bytes_read); m_cursor_pos += num_bytes_read; return ErrorCode_Success; } @@ -53,7 +53,7 @@ bool BufferReader::try_read_string_view (string_view& str_view, size_t read_size if ((m_cursor_pos + read_size) > m_size) { return false; } - str_view = string_view(reinterpret_cast(m_buffer + m_cursor_pos), + str_view = string_view(reinterpret_cast(m_data + m_cursor_pos), read_size); m_cursor_pos += read_size; return true; diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 235467d49..ae9b0586e 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -23,11 +23,11 @@ class BufferReader : public ReaderInterface { return "BufferReader operation failed"; } }; - BufferReader () : m_buffer(nullptr), + BufferReader () : m_data(nullptr), m_size(0), m_cursor_pos(0) {} BufferReader (const int8_t* data, size_t size) : - m_buffer(data), + m_data(data), m_size(size), m_cursor_pos(0) {} @@ -37,7 +37,7 @@ class BufferReader : public ReaderInterface { [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; [[nodiscard]] size_t get_buffer_length() { return m_size; } void reset_buffer (const int8_t* data, size_t size) { - m_buffer = data; + m_data = data; m_size = size; m_cursor_pos = 0; } @@ -52,8 +52,12 @@ class BufferReader : public ReaderInterface { [[nodiscard]] bool try_read_string_view (std::string_view& str_view, size_t read_size); protected: - - const int8_t* m_buffer; + void reset_buffer (const int8_t* data, size_t size, size_t cursor_pos) { + m_data = data; + m_size = size; + m_cursor_pos = cursor_pos; + } + const int8_t* m_data; size_t m_size; size_t m_cursor_pos; }; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index bf0ce74af..e738efc67 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -21,8 +21,7 @@ using std::string; BufferedFileReader::BufferedFileReader () { 
m_file_pos = 0; m_fd = -1; - m_checkpoint_enabled = 0; - constexpr size_t DefaultBufferSize = 65536; + m_checkpoint_enabled = false; if (ErrorCode_Success != set_buffer_size(DefaultBufferSize)) { throw "Failed to init reader buffer size\n"; } @@ -70,45 +69,51 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size) { return refill_reader_buffer (refill_size, num_bytes_refilled); } -ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, size_t& num_bytes_refilled) { +ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, + size_t& num_bytes_refilled) { num_bytes_refilled = 0; if (false == m_checkpoint_enabled) { // recover from a previous reset if (m_size > refill_size) { m_read_buffer = make_unique(refill_size); - m_buffer = m_read_buffer.get(); } auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get(), refill_size, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } - m_size = num_bytes_refilled; - m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; - } else { - if (m_size == 0) { - // if buffer is uninitialized, reset the cursor_pos - // in case it's after a seek + m_cursor_pos = 0; + if (m_data == nullptr) { m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; + m_data = m_read_buffer.get(); } + m_size = num_bytes_refilled; + } else { // Messy way of copying data from old buffer to new buffer auto new_buffer = make_unique(m_size + refill_size); memcpy(new_buffer.get(), m_read_buffer.get(), m_size); m_read_buffer = std::move(new_buffer); - m_buffer = m_read_buffer.get(); - auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get() + m_size, refill_size, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } + + if (nullptr == m_data) { + // if buffer is uninitialized, reset the cursor_pos + // in case it's after a seek + m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; + } + m_data = m_read_buffer.get(); m_size += 
num_bytes_refilled; + } return ErrorCode_Success; } -ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { +ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, + size_t& num_bytes_read) { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -152,7 +157,6 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, siz return ErrorCode_Success; } -// Maybe everytime, I should always read a page? ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (m_fd == -1) { return ErrorCode_NotInit; @@ -167,7 +171,7 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { SPDLOG_ERROR("Seek back not allowed when checkpoint is not enabled"); return ErrorCode_Failure; } - if (pos < m_checkpointed_pos) { + if (pos < m_checkpoint_pos) { SPDLOG_ERROR("Seek back before the checkpoint is not supported"); return ErrorCode_Failure; } @@ -185,8 +189,8 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (offset == -1) { return ErrorCode_errno; } - // invalidate buffered_data by setting size to 0 - m_size = 0; + // invalidate buffered_data + reset_buffer(nullptr, 0); m_file_pos = pos; } else { size_t num_bytes_to_refill = seek_distance - buffer_available_data; @@ -245,17 +249,18 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim while (false == found_delim) { auto remaining_data_size = m_size - m_cursor_pos; // find the pointer pointing to the delimiter - const auto* delim_ptr = reinterpret_cast(memchr(m_buffer + m_cursor_pos, - delim, remaining_data_size)); + const auto* delim_ptr = + reinterpret_cast(memchr(m_data + m_cursor_pos, + delim, remaining_data_size)); if (delim_ptr != nullptr) { - delim_pos = (delim_ptr - m_buffer) + 1; + delim_pos = (delim_ptr - m_data) + 1; found_delim = true; } else { delim_pos = m_size; } // append to strings size_t copy_length = delim_pos - m_cursor_pos; - std::string_view 
substr(reinterpret_cast(m_buffer + m_cursor_pos), + std::string_view substr(reinterpret_cast(m_data + m_cursor_pos), copy_length); str.append(substr); // increment file pos to the delimiter or the end of file @@ -287,18 +292,19 @@ void BufferedFileReader::revert_pos() { SPDLOG_ERROR("Checkpoint is not enabled"); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - m_file_pos = m_checkpointed_pos; + m_file_pos = m_checkpoint_pos; m_cursor_pos = m_checkpointed_buffer_pos; } -void BufferedFileReader::mark_pos() { +size_t BufferedFileReader::mark_pos() { if (true == m_checkpoint_enabled) { SPDLOG_ERROR("I haven't carefully think about whether we should allow this or not"); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - m_checkpointed_pos = m_file_pos; + m_checkpoint_pos = m_file_pos; m_checkpointed_buffer_pos = m_cursor_pos; m_checkpoint_enabled = true; + return m_file_pos; } void BufferedFileReader::reset_checkpoint () { @@ -315,9 +321,9 @@ void BufferedFileReader::reset_checkpoint () { << m_reader_buffer_exp; auto new_buffer = make_unique(new_buffer_size); - memcpy(new_buffer.get(), m_buffer + copy_pos, copy_size); + memcpy(new_buffer.get(), m_data + copy_pos, copy_size); m_read_buffer = std::move(new_buffer); - m_buffer = m_read_buffer.get(); + m_data = m_read_buffer.get(); m_size = copy_size; m_cursor_pos -= copy_pos; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 64f488c49..a66da679b 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -17,6 +17,7 @@ class BufferedFileReader : public BufferReader { public: + static constexpr size_t DefaultBufferSize = 65536; // Types class OperationFailed : public TraceableException { public: @@ -110,7 +111,7 @@ class BufferedFileReader : public BufferReader { */ [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer) const; - void mark_pos(); + size_t mark_pos(); void 
revert_pos(); void reset_checkpoint (); [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); @@ -132,7 +133,7 @@ class BufferedFileReader : public BufferReader { size_t m_reader_buffer_cursor_mask; // checkpoint specific data bool m_checkpoint_enabled; - size_t m_checkpointed_pos; + size_t m_checkpoint_pos; size_t m_checkpointed_buffer_pos; }; diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 494f00318..c4d5b3394 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -10,10 +10,8 @@ // Project headers #include "../src/FileWriter.hpp" #include "../src/BufferedFileReader.hpp" -#include "../src/Utils.hpp" TEST_CASE("Test reading data", "[BufferedFileReader]") { - ErrorCode error_code; // Initialize data for testing size_t test_data_size = 4L * 1024 * 1024 + 1; // 4MB + 1 @@ -60,7 +58,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { REQUIRE(0 == memcmp(read_buffer, test_data, test_data_size)); - // lastly, make sure the buffer is drained out + // lastly, make sure the file reaches eof size_t read_size4 = 1; REQUIRE(ErrorCode_EndOfFile == file_reader.try_read(read_buffer + buffer_offset, read_size4, num_bytes_read)); @@ -72,94 +70,96 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { file_reader.open(test_file_path); // seek to some random position and do a read - size_t seek_pos {245}; - size_t num_bytes_to_read = 65540; + size_t seek_pos1 {245}; + size_t num_bytes_to_read {65540}; size_t num_byte_read; - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos1)); REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); REQUIRE(num_bytes_to_read == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); + REQUIRE(0 == memcmp(read_buffer, 
test_data + seek_pos1, num_bytes_to_read)); // seek front to random position and do a read - seek_pos = 345212; - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); + size_t seek_pos2 {345212}; + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos2)); REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); REQUIRE(num_bytes_to_read == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); + REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos2, num_bytes_to_read)); // the seek should fail on a backward seek when checkpoint is not enabled - seek_pos -= 1; - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos2)); } - SECTION("Simple seek with checkpoint") { + SECTION("seek with checkpoint") { BufferedFileReader file_reader; file_reader.open(test_file_path); - // first, read some data to proceed the file_pos - size_t num_bytes_to_read = 65540; size_t num_byte_read; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + + // first, read some data to advance the file_pos + size_t num_bytes_to_read_1 = 65540; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_1, num_byte_read)); - REQUIRE(file_reader.get_pos() == num_bytes_to_read); + REQUIRE(file_reader.get_pos() == num_bytes_to_read_1); // set a checkpoint - size_t checkpoint_pos = file_reader.get_pos(); - file_reader.mark_pos(); + size_t checkpoint_pos = file_reader.mark_pos(); // keep reading some data - num_bytes_to_read = 345212; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + size_t num_bytes_to_read_2 = 345212; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_2, num_byte_read)); - size_t latest_file_pos = checkpoint_pos + num_bytes_to_read; - 
REQUIRE(file_reader.get_pos() == checkpoint_pos + num_bytes_to_read); + REQUIRE(file_reader.get_pos() == num_bytes_to_read_1 + num_bytes_to_read_2); + size_t latest_file_pos = file_reader.get_pos(); // now seek back to some where between - size_t seek_pos = file_reader.get_pos() / 2; - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + size_t seek_pos_1 = checkpoint_pos + 500; + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos_1)); + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_2, num_byte_read)); - REQUIRE(num_bytes_to_read == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); - // get the latest file_pos - latest_file_pos = std::max(latest_file_pos, file_reader.get_pos()); + REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos_1, num_bytes_to_read_2)); + // update the latest_file_pos if necessary + latest_file_pos = std::max(latest_file_pos, file_reader.get_pos()); - // now try to seek back to an unacceptable place + // now try to seek back to a pos that's before the checkpoint REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(checkpoint_pos-1)); - // try reset, which should fail now. 
- // REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); // now go back to latest data REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(latest_file_pos)); + // make sure data read after latest_file_pos + size_t num_bytes_to_read_3 = 4096; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_3, + num_byte_read)); + REQUIRE(num_bytes_to_read_3 == num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, num_bytes_to_read_3)); + // reset, and then seek back should fail file_reader.reset_checkpoint(); - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos_1)); - // make sure data read after checkpoint-reset still matches - num_bytes_to_read = 4096; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + // make sure data read after checkpoint-reset are still correct + size_t num_bytes_to_read_4 = 65780; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_4, num_byte_read)); - - REQUIRE(num_bytes_to_read == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, num_bytes_to_read)); + REQUIRE(num_bytes_to_read_4 == num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos + num_bytes_to_read_3, + num_bytes_to_read_4)); // Make sure now we can't reset back to checkpoint - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos_1)); } - SECTION("Simple seek with delayed read") { + SECTION("seek with delayed read") { BufferedFileReader file_reader; file_reader.open(test_file_path); - // first, read seek to some random file_pos + // first, advance to some random file_pos REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(45313)); // set a checkpoint - size_t checkpoint_pos = file_reader.get_pos(); - 
file_reader.mark_pos(); + size_t checkpoint_pos = file_reader.mark_pos(); // keep reading some data size_t num_bytes_to_read; @@ -168,7 +168,6 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { num_bytes_to_read = 345212; REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); - size_t latest_file_pos = checkpoint_pos + num_bytes_to_read; REQUIRE(file_reader.get_pos() == checkpoint_pos + num_bytes_to_read); // now seek back to some where between @@ -178,31 +177,8 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { num_byte_read)); REQUIRE(num_bytes_to_read == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); - // get the latest file_pos - latest_file_pos = std::max(latest_file_pos, file_reader.get_pos()); - - - // now try to seek back to an unacceptable place - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(checkpoint_pos-1)); - // try reset, which should success now. - file_reader.reset_checkpoint(); - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos)); - - // now go back to latest data - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(latest_file_pos)); - - // make sure data read after checkpoint-reset still matches - num_bytes_to_read = 65536; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, - num_byte_read)); - REQUIRE(num_bytes_to_read == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, num_bytes_to_read)); - - // Make sure now we can't reset back to checkpoint - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(latest_file_pos)); } - SECTION("Reset seek with corner cases") { - - } + delete[] test_data; + delete[] read_buffer; } \ No newline at end of file From 2324a21e5d82c1664aefa6f30a16e82eb368ee5e Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 16 May 2023 12:14:22 -0400 Subject: [PATCH 
024/121] remove fstat dependency from libarchive reader --- components/core/src/LibarchiveReader.cpp | 34 ++++++------------------ components/core/src/LibarchiveReader.hpp | 4 +-- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/components/core/src/LibarchiveReader.cpp b/components/core/src/LibarchiveReader.cpp index 180821030..f8cd50987 100644 --- a/components/core/src/LibarchiveReader.cpp +++ b/components/core/src/LibarchiveReader.cpp @@ -9,7 +9,7 @@ // Project headers #include "Defs.h" -ErrorCode LibarchiveReader::try_open (size_t buffer_length, const char* buffer, FileReader& file_reader, const std::string& path_if_compressed_file) { +ErrorCode LibarchiveReader::try_open (size_t buffer_length, const char* buffer, ReaderInterface& reader, const std::string& path_if_compressed_file) { // Create and initialize internal libarchive m_archive = archive_read_new(); if (nullptr == m_archive) { @@ -39,7 +39,7 @@ ErrorCode LibarchiveReader::try_open (size_t buffer_length, const char* buffer, memcpy(m_buffer.data(), buffer, buffer_length); m_initial_buffer_content_exhausted = m_buffer.empty(); - m_file_reader = &file_reader; + m_file_reader = &reader; m_filename_if_compressed = path_if_compressed_file; @@ -207,32 +207,14 @@ ErrorCode LibarchiveReader::libarchive_read_callback (const void** buffer, size_ } ErrorCode LibarchiveReader::libarchive_skip_callback (off_t num_bytes_to_skip, size_t& num_bytes_skipped) { - // Get current position - size_t pos; - auto error_code = m_file_reader->try_get_pos(pos); - if (ErrorCode_Success != error_code) { - return error_code; - } - - // Calculate desired position, ensuring its within the file - size_t desired_pos = pos + num_bytes_to_skip; - struct stat stat_buffer = {}; - error_code = m_file_reader->try_fstat(stat_buffer); - if (ErrorCode_Success != error_code) { - return error_code; - } - if (desired_pos > stat_buffer.st_size) { - desired_pos = stat_buffer.st_size; - } - - // Seek to desired position - error_code = 
m_file_reader->try_seek_from_begin(desired_pos); - if (ErrorCode_Success != error_code) { + std::vector temporary_read_buffer; + auto error_code = m_file_reader->try_read(temporary_read_buffer.data(), num_bytes_to_skip, + num_bytes_skipped); + if (ErrorCode_EndOfFile == error_code) { + num_bytes_skipped = 0; + } else if (ErrorCode_Success != error_code) { return error_code; } - - num_bytes_skipped = desired_pos - pos; - return ErrorCode_Success; } diff --git a/components/core/src/LibarchiveReader.hpp b/components/core/src/LibarchiveReader.hpp index f17f740d6..0f2e7a8ff 100644 --- a/components/core/src/LibarchiveReader.hpp +++ b/components/core/src/LibarchiveReader.hpp @@ -51,7 +51,7 @@ class LibarchiveReader { * @return ErrorCode_Success on success * @return ErrorCode_Failure on failure */ - ErrorCode try_open (size_t buffer_length, const char* buffer, FileReader& file_reader, const std::string& path_if_compressed_file); + ErrorCode try_open (size_t buffer_length, const char* buffer, ReaderInterface& reader, const std::string& path_if_compressed_file); /** * Closes the reader */ @@ -160,7 +160,7 @@ class LibarchiveReader { struct archive_entry* m_archive_entry; std::vector m_buffer; - FileReader* m_file_reader; + ReaderInterface* m_file_reader; bool m_initial_buffer_content_exhausted; std::string m_filename_if_compressed; From 9e48e126fdc179cc0d97872a51aff1acc4effed7 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 16 May 2023 12:31:43 -0400 Subject: [PATCH 025/121] Remove code specific for BFR --- components/core/src/BufferReader.cpp | 12 +--- .../src/ffi/ir_stream/decoding_methods.cpp | 19 +----- .../core/tests/test-ir_encoding_methods.cpp | 66 +++++++++---------- 3 files changed, 37 insertions(+), 60 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index b14c5069e..777ab5950 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp 
@@ -9,7 +9,7 @@ using std::string_view; [[nodiscard]] ErrorCode BufferReader::try_get_pos (size_t& pos) { - if (nullptr == m_data || 0 == m_size) { + if (nullptr == m_data) { return ErrorCode_NotInit; } pos = m_cursor_pos; @@ -17,7 +17,7 @@ using std::string_view; } [[nodiscard]] ErrorCode BufferReader::try_seek_from_begin (size_t pos) { - if (nullptr == m_data || 0 == m_size) { + if (nullptr == m_data) { return ErrorCode_NotInit; } if (pos > m_size) { @@ -28,10 +28,6 @@ using std::string_view; } ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - // this is not defined by specifications, - // but we need this strong behavior for the upper class - num_bytes_read = 0; - if (nullptr == m_data) { return ErrorCode_NotInit; } @@ -39,10 +35,6 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n return ErrorCode_BadParam; } - if (m_size == 0 && m_cursor_pos != 0) { - return ErrorCode_EndOfFile; - } - num_bytes_read = std::min(m_size - m_cursor_pos, num_bytes_to_read); memcpy(buf, m_data + m_cursor_pos, num_bytes_read); m_cursor_pos += num_bytes_read; diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index ccf511618..0e208d94c 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -466,13 +466,10 @@ namespace ffi::ir_stream { if (ir_buf.get_buffer_length() == 0) { return IRErrorCode_Incomplete_IR; } - size_t seekback_pos = ir_buf.get_pos(); - char buffer[cProtocol::MagicNumberLength]; size_t num_bytes_read; auto error_code = ir_buf.try_read(buffer, cProtocol::MagicNumberLength, num_bytes_read); if (error_code != ErrorCode_Success || num_bytes_read != cProtocol::MagicNumberLength) { - ir_buf.seek_from_begin(seekback_pos); return IRErrorCode_Incomplete_IR; } if (0 == memcmp(buffer, cProtocol::FourByteEncodingMagicNumber, @@ -482,25 +479,21 @@ 
namespace ffi::ir_stream { cProtocol::MagicNumberLength)) { is_four_bytes_encoding = false; } else { - ir_buf.seek_from_begin(seekback_pos); return IRErrorCode_Corrupted_IR; } return IRErrorCode_Success; } - + IRErrorCode decode_preamble (BufferReader& ir_buf, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size) { - size_t marked_pos = ir_buf.get_pos(); if (auto error_code = read_metadata_info(ir_buf, metadata_type, metadata_size); error_code != IRErrorCode_Success) { - ir_buf.seek_from_begin(marked_pos); return error_code; } metadata_pos = ir_buf.get_pos(); //TODO: this might not be optimal if (ErrorCode_Success != ir_buf.try_seek_from_begin(metadata_pos + metadata_size)) { - ir_buf.seek_from_begin(marked_pos); return IRErrorCode_Incomplete_IR; } return IRErrorCode_Success; @@ -520,17 +513,9 @@ namespace ffi::ir_stream { IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, epoch_time_ms_t& timestamp) { - if (ir_buf.get_buffer_length() == 0) { - return IRErrorCode_Incomplete_IR; - } - size_t marked_pos = ir_buf.get_pos(); - auto error_code = generic_decode_next_message( + return generic_decode_next_message( ir_buf, message, timestamp ); - if (IRErrorCode_Success != error_code) { - ir_buf.seek_from_begin(marked_pos); - } - return error_code; } } } diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index c9885f0ba..897a5ed03 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -206,14 +206,14 @@ static void set_timestamp_info (const nlohmann::json& metadata_json, TimestampIn TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { bool is_four_bytes_encoding; - + BufferReader ir_buffer; // Test eight-byte encoding vector eight_byte_encoding_vec{EightByteEncodingMagicNumber, EightByteEncodingMagicNumber + MagicNumberLength}; - BufferReader 
eight_byte_ir_buffer(eight_byte_encoding_vec.data(), - eight_byte_encoding_vec.size()); - REQUIRE(get_encoding_type(eight_byte_ir_buffer, is_four_bytes_encoding) == + // Test eight-byte encoding + ir_buffer.reset_buffer(eight_byte_encoding_vec.data(), eight_byte_encoding_vec.size()); + REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); @@ -221,27 +221,25 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { vector four_byte_encoding_vec{FourByteEncodingMagicNumber, FourByteEncodingMagicNumber + MagicNumberLength}; - BufferReader four_byte_ir_buffer(four_byte_encoding_vec.data(), - four_byte_encoding_vec.size()); - REQUIRE(get_encoding_type(four_byte_ir_buffer, is_four_bytes_encoding) == + ir_buffer.reset_buffer(four_byte_encoding_vec.data(), four_byte_encoding_vec.size()); + REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); // Test error on empty and incomplete ir_buffer const vector empty_ir_vec; - BufferReader empty_ir_buffer(empty_ir_vec.data(), empty_ir_vec.size()); - REQUIRE(get_encoding_type(empty_ir_buffer, is_four_bytes_encoding) == + ir_buffer.reset_buffer(empty_ir_vec.data(), empty_ir_vec.size()); + REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); - BufferReader incomplete_ir_buffer(four_byte_encoding_vec.data(), - four_byte_encoding_vec.size() - 1); - REQUIRE(get_encoding_type(incomplete_ir_buffer, is_four_bytes_encoding) == + ir_buffer.reset_buffer(four_byte_encoding_vec.data(), four_byte_encoding_vec.size() - 1); + REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); // Test error on invalid encoding const vector invalid_ir_vec{0x02, 0x43, 0x24, 0x34}; - BufferReader invalid_ir_buffer(invalid_ir_vec.data(), invalid_ir_vec.size()); - 
REQUIRE(get_encoding_type(invalid_ir_buffer, is_four_bytes_encoding) == + ir_buffer.reset_buffer(invalid_ir_vec.data(), invalid_ir_vec.size()); + REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Corrupted_IR); } @@ -250,6 +248,7 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode eight_byte_encoded_variable_t) { vector ir_buf; + BufferReader ir_buffer; constexpr char timestamp_pattern[] = "%Y-%m-%d %H:%M:%S,%3"; constexpr char timestamp_pattern_syntax[] = "yyyy-MM-dd HH:mm:ss"; constexpr char time_zone_id[] = "Asia/Tokyo"; @@ -259,25 +258,25 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode const size_t encoded_preamble_end_pos = ir_buf.size(); // Check if encoding type is properly read - BufferReader preamble_buffer(ir_buf.data(), ir_buf.size()); + ir_buffer.reset_buffer(ir_buf.data(), ir_buf.size()); bool is_four_bytes_encoding; - REQUIRE(get_encoding_type(preamble_buffer, is_four_bytes_encoding) == + REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); - REQUIRE(MagicNumberLength == preamble_buffer.get_pos()); + REQUIRE(MagicNumberLength == ir_buffer.get_pos()); // Test if preamble can be decoded correctly TimestampInfo ts_info; encoded_tag_t metadata_type{0}; size_t metadata_pos{0}; uint16_t metadata_size{0}; - REQUIRE(decode_preamble(preamble_buffer, metadata_type, metadata_pos, metadata_size) == + REQUIRE(decode_preamble(ir_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Success); - REQUIRE(encoded_preamble_end_pos == preamble_buffer.get_pos()); + REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); string_view json_metadata; - REQUIRE(ErrorCode_Success == preamble_buffer.try_seek_from_begin(metadata_pos)); - REQUIRE(preamble_buffer.try_read_string_view(json_metadata, metadata_size)); + REQUIRE(ErrorCode_Success == 
ir_buffer.try_seek_from_begin(metadata_pos)); + REQUIRE(ir_buffer.try_read_string_view(json_metadata, metadata_size)); auto metadata_json = nlohmann::json::parse(json_metadata); REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == @@ -287,7 +286,7 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode REQUIRE(timestamp_pattern_syntax == ts_info.timestamp_pattern_syntax); REQUIRE(time_zone_id == ts_info.time_zone_id); REQUIRE(timestamp_pattern == ts_info.timestamp_pattern); - REQUIRE(encoded_preamble_end_pos == preamble_buffer.get_pos()); + REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); if constexpr (is_same_v) { REQUIRE(reference_ts == @@ -306,6 +305,7 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode // Test if corrupted IR can be detected ir_buf[MagicNumberLength] = 0x23; + ir_buffer.seek_from_begin(MagicNumberLength); BufferReader corrupted_preamble_buffer(ir_buf.data(), ir_buf.size()); REQUIRE(decode_preamble( corrupted_preamble_buffer, metadata_type, metadata_pos, metadata_size) == @@ -352,6 +352,7 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", TEST_CASE("message_decode_error", "[ffi][decode_next_message]") { vector ir_buf; + BufferReader ir_buffer; string logtype; string placeholder_as_string{enum_to_underlying_type(VariablePlaceholder::Dictionary)}; @@ -371,22 +372,21 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") epoch_time_ms_t timestamp; // Test if a trailing escape triggers a decoder error - auto ir_with_extra_escape{ir_buf}; + auto ir_with_extra_escape {ir_buf}; ir_with_extra_escape.at(logtype_end_pos - 1) = ffi::cVariablePlaceholderEscapeCharacter; - BufferReader ir_buffer_with_extra_escape(ir_with_extra_escape.data(), ir_with_extra_escape.size()); + ir_buffer.reset_buffer(ir_with_extra_escape.data(), ir_with_extra_escape.size()); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == - 
decode_next_message(ir_buffer_with_extra_escape, - decoded_message, timestamp)); + decode_next_message(ir_buffer, decoded_message, + timestamp)); // Test if an extra placeholder triggers a decoder error auto ir_with_extra_placeholder{ir_buf}; ir_with_extra_placeholder.at(logtype_end_pos - 1) = enum_to_underlying_type(VariablePlaceholder::Dictionary); - BufferReader ir_buffer_with_extra_placeholder(ir_with_extra_escape.data(), - ir_with_extra_escape.size()); + ir_buffer.reset_buffer(ir_with_extra_escape.data(), ir_with_extra_escape.size()); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == - decode_next_message(ir_buffer_with_extra_placeholder, - decoded_message, timestamp)); + decode_next_message(ir_buffer, decoded_message, + timestamp)); } TEST_CASE("decode_next_message_four_byte_negative_delta", "[ffi][decode_next_message]") { @@ -399,12 +399,12 @@ TEST_CASE("decode_next_message_four_byte_negative_delta", "[ffi][decode_next_mes REQUIRE(true == encode_message(reference_delta_ts_negative, message, logtype, ir_buf)); - BufferReader encoded_message_buffer(ir_buf.data(), ir_buf.size()); + BufferReader ir_buffer(ir_buf.data(), ir_buf.size()); string decoded_message; epoch_time_ms_t delta_ts; REQUIRE(IRErrorCode::IRErrorCode_Success == - decode_next_message(encoded_message_buffer, - decoded_message, delta_ts)); + decode_next_message(ir_buffer, decoded_message, + delta_ts)); REQUIRE(message == decoded_message); REQUIRE(delta_ts == reference_delta_ts_negative); } From 57dd3105b862996fe136095900a6407fce62a385 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 16 May 2023 13:09:45 -0400 Subject: [PATCH 026/121] Remove BFR dependency on BR --- components/core/src/BufferedFileReader.cpp | 110 +++++++++--------- components/core/src/BufferedFileReader.hpp | 17 ++- .../core/tests/test-BufferedFileReader.cpp | 2 + 3 files changed, 69 insertions(+), 60 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp 
b/components/core/src/BufferedFileReader.cpp index e738efc67..aec553d5a 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -25,7 +25,7 @@ BufferedFileReader::BufferedFileReader () { if (ErrorCode_Success != set_buffer_size(DefaultBufferSize)) { throw "Failed to init reader buffer size\n"; } - m_read_buffer = make_unique(m_reader_buffer_size); + m_buffer = make_unique(m_reader_buffer_size); } BufferedFileReader::~BufferedFileReader () { @@ -75,37 +75,35 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, if (false == m_checkpoint_enabled) { // recover from a previous reset if (m_size > refill_size) { - m_read_buffer = make_unique(refill_size); + m_buffer = make_unique(refill_size); } - auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get(), + auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), refill_size, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } - m_cursor_pos = 0; - if (m_data == nullptr) { - m_cursor_pos = m_file_pos & m_reader_buffer_cursor_mask; - m_data = m_read_buffer.get(); + m_cursor = 0; + if (m_size == 0) { + m_cursor = m_file_pos & m_reader_buffer_cursor_mask; } m_size = num_bytes_refilled; } else { // Messy way of copying data from old buffer to new buffer auto new_buffer = make_unique(m_size + refill_size); - memcpy(new_buffer.get(), m_read_buffer.get(), m_size); - m_read_buffer = std::move(new_buffer); - auto error_code = try_read_into_buffer(m_fd, m_read_buffer.get() + m_size, refill_size, + memcpy(new_buffer.get(), m_buffer.get(), m_size); + m_buffer = std::move(new_buffer); + auto error_code = try_read_into_buffer(m_fd, m_buffer.get() + m_size, refill_size, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } - if (nullptr == m_data) { + if (0 == m_size) { // if buffer is uninitialized, reset the cursor_pos // in case it's after a seek - m_cursor_pos = m_file_pos & 
m_reader_buffer_cursor_mask; + m_cursor = m_file_pos & m_reader_buffer_cursor_mask; } - m_data = m_read_buffer.get(); m_size += num_bytes_refilled; } @@ -126,34 +124,31 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, size_t num_bytes_read_from_buffer {0}; // keep reading until enough data is read or an eof is seen while (true) { - auto error_code = BufferReader::try_read(buf + num_bytes_read, - num_bytes_to_read_from_buffer, - num_bytes_read_from_buffer); - if (ErrorCode_Success == error_code || - ErrorCode_EndOfFile == error_code || - ErrorCode_NotInit == error_code) { - m_file_pos += num_bytes_read_from_buffer; - num_bytes_read += num_bytes_read_from_buffer; + // if the data in the buffer is valid + if (m_size != 0) { + num_bytes_read_from_buffer = std::min(num_bytes_to_read_from_buffer, + buffer_remaining_data()); + memcpy(buf + num_bytes_read, buffer_head(), num_bytes_read_from_buffer); num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; - if (num_bytes_to_read_from_buffer == 0) { - break; - } - // refill the buffer if more bytes are to be read - error_code = refill_reader_buffer(m_reader_buffer_size); - if (ErrorCode_EndOfFile == error_code) { - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } else { - break; - } - } - else if (ErrorCode_Success != error_code) { - return error_code; - } - } else { + num_bytes_read += num_bytes_read_from_buffer; + m_cursor += num_bytes_read_from_buffer; + m_file_pos += num_bytes_read_from_buffer; + } + if (num_bytes_to_read_from_buffer == 0) { + break; + } + // refill the buffer if more bytes are to be read + auto error_code = refill_reader_buffer(m_reader_buffer_size); + if (ErrorCode_EndOfFile == error_code) { + break; + } else if (ErrorCode_Success != error_code) { return error_code; } } + + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } return ErrorCode_Success; } @@ -175,13 +170,13 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { 
SPDLOG_ERROR("Seek back before the checkpoint is not supported"); return ErrorCode_Failure; } - m_cursor_pos -= (m_file_pos - pos); + m_cursor -= (m_file_pos - pos); m_file_pos = pos; } else { - auto buffer_available_data = m_size - m_cursor_pos; + auto buffer_available_data = buffer_remaining_data(); auto seek_distance = pos - m_file_pos; if (seek_distance <= buffer_available_data) { - m_cursor_pos += seek_distance; + m_cursor += seek_distance; m_file_pos = pos; } else if (false == m_checkpoint_enabled) { auto buffer_aligned_pos = pos & m_reader_buffer_aligned_mask; @@ -190,7 +185,8 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_errno; } // invalidate buffered_data - reset_buffer(nullptr, 0); + // Maybe we need a flag for it? + m_size = 0; m_file_pos = pos; } else { size_t num_bytes_to_refill = seek_distance - buffer_available_data; @@ -206,7 +202,7 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { } if (num_bytes_to_refill <= m_reader_buffer_size) { m_file_pos = pos; - m_cursor_pos += seek_distance; + m_cursor += seek_distance; break; } num_bytes_to_refill -= num_bytes_refilled; @@ -230,7 +226,8 @@ ErrorCode BufferedFileReader::try_open (const string& path) { } m_path = path; m_file_pos = 0; - reset_buffer(m_read_buffer.get(), 0); + m_size = 0; + m_cursor = 0; return ErrorCode_Success; } @@ -247,25 +244,25 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim bool found_delim {false}; size_t delim_pos; while (false == found_delim) { - auto remaining_data_size = m_size - m_cursor_pos; + auto remaining_data_size = m_size - m_cursor; // find the pointer pointing to the delimiter const auto* delim_ptr = - reinterpret_cast(memchr(m_data + m_cursor_pos, + reinterpret_cast(memchr(buffer_head(), delim, remaining_data_size)); if (delim_ptr != nullptr) { - delim_pos = (delim_ptr - m_data) + 1; + delim_pos = (delim_ptr - m_buffer.get()) + 1; found_delim = true; } else { delim_pos = m_size; 
} // append to strings - size_t copy_length = delim_pos - m_cursor_pos; - std::string_view substr(reinterpret_cast(m_data + m_cursor_pos), + size_t copy_length = delim_pos - m_cursor; + std::string_view substr(reinterpret_cast(buffer_head()), copy_length); str.append(substr); // increment file pos to the delimiter or the end of file m_file_pos += copy_length; - m_cursor_pos = delim_pos; + m_cursor = delim_pos; if (false == found_delim) { if (auto error_code = refill_reader_buffer(m_reader_buffer_size); ErrorCode_Success != error_code) { @@ -293,7 +290,7 @@ void BufferedFileReader::revert_pos() { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } m_file_pos = m_checkpoint_pos; - m_cursor_pos = m_checkpointed_buffer_pos; + m_cursor = m_checkpointed_buffer_pos; } size_t BufferedFileReader::mark_pos() { @@ -302,7 +299,7 @@ size_t BufferedFileReader::mark_pos() { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } m_checkpoint_pos = m_file_pos; - m_checkpointed_buffer_pos = m_cursor_pos; + m_checkpointed_buffer_pos = m_cursor; m_checkpoint_enabled = true; return m_file_pos; } @@ -314,19 +311,18 @@ void BufferedFileReader::reset_checkpoint () { } if (m_size != m_reader_buffer_size) { // allocate new buffer for buffered data that hasn't been seek passed - auto copy_pos = m_cursor_pos & m_reader_buffer_aligned_mask; + auto copy_pos = m_cursor & m_reader_buffer_aligned_mask; auto copy_size = m_size - copy_pos; // Use a quantized size for the new buffer size auto new_buffer_size = (1 + ((copy_size - 1) >> m_reader_buffer_exp)) << m_reader_buffer_exp; auto new_buffer = make_unique(new_buffer_size); - memcpy(new_buffer.get(), m_data + copy_pos, copy_size); - m_read_buffer = std::move(new_buffer); - m_data = m_read_buffer.get(); + memcpy(new_buffer.get(), m_buffer.get() + copy_pos, copy_size); + m_buffer = std::move(new_buffer); m_size = copy_size; - m_cursor_pos -= copy_pos; + m_cursor -= copy_pos; } m_checkpoint_enabled = false; } @@ -367,7 
+363,7 @@ void BufferedFileReader::close () { if (m_checkpoint_enabled) { SPDLOG_DEBUG("close file without resetting checkpoint"); - m_read_buffer = make_unique(m_reader_buffer_size); + m_buffer = make_unique(m_reader_buffer_size); m_checkpoint_enabled = false; } } diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index a66da679b..75e86b258 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -11,11 +11,11 @@ // Project headers #include "Defs.h" #include "ErrorCode.hpp" -#include "BufferReader.hpp" +#include "ReaderInterface.hpp" #include "TraceableException.hpp" -class BufferedFileReader : public BufferReader { +class BufferedFileReader : public ReaderInterface { public: static constexpr size_t DefaultBufferSize = 65536; // Types @@ -117,6 +117,13 @@ class BufferedFileReader : public BufferReader { [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); private: + + [[nodiscard]] size_t buffer_remaining_data() { + return m_size - m_cursor; + } + [[nodiscard]] int8_t* buffer_head() { + return m_buffer.get() + m_cursor; + } [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); @@ -126,7 +133,11 @@ class BufferedFileReader : public BufferReader { std::string m_path; // Buffer specific data - std::unique_ptr m_read_buffer; + std::unique_ptr m_buffer; + size_t m_size; + size_t m_cursor; + + // constant flag size_t m_reader_buffer_exp; size_t m_reader_buffer_size; size_t m_reader_buffer_aligned_mask; diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index c4d5b3394..670ade836 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -39,6 +39,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { REQUIRE(ErrorCode_Success 
== file_reader.try_read(read_buffer + buffer_offset, read_size1, num_bytes_read)); REQUIRE(read_size1 == num_bytes_read); + REQUIRE(0 == memcmp(read_buffer, test_data, read_size1)); buffer_offset += num_bytes_read; // second, read a large chunk of data, so @@ -47,6 +48,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer + buffer_offset, read_size2, num_bytes_read)); REQUIRE(read_size2 == num_bytes_read); + REQUIRE(0 == memcmp(read_buffer, test_data, read_size1 + read_size2)); buffer_offset += num_bytes_read; // third, read remaining data From d84276ca89f9e9aa0a47b207aa4a9983a36cefdf Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 16 May 2023 20:22:14 -0400 Subject: [PATCH 027/121] remove m_cursor and replace with buffer begin_pos --- components/core/src/BufferedFileReader.cpp | 96 +++++++++------------- components/core/src/BufferedFileReader.hpp | 10 +-- 2 files changed, 42 insertions(+), 64 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index aec553d5a..c148be7f9 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -32,6 +32,15 @@ BufferedFileReader::~BufferedFileReader () { close(); } +size_t BufferedFileReader::remaining_data_size () +{ + if (m_size == 0) { + return 0; + } + assert(m_size >= cursor_pos()); + return m_size - cursor_pos(); +} + ErrorCode BufferedFileReader::try_get_pos (size_t& pos) { if (-1 == m_fd) { return ErrorCode_NotInit; @@ -82,28 +91,18 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, if (error_code != ErrorCode_Success) { return error_code; } - m_cursor = 0; - if (m_size == 0) { - m_cursor = m_file_pos & m_reader_buffer_cursor_mask; - } + m_buffer_begin_pos = m_file_pos & m_reader_buffer_aligned_mask; m_size = num_bytes_refilled; - } else { // Messy way of copying data from old 
buffer to new buffer auto new_buffer = make_unique(m_size + refill_size); memcpy(new_buffer.get(), m_buffer.get(), m_size); - m_buffer = std::move(new_buffer); - auto error_code = try_read_into_buffer(m_fd, m_buffer.get() + m_size, refill_size, + auto error_code = try_read_into_buffer(m_fd, &new_buffer[m_size], refill_size, num_bytes_refilled); + m_buffer = std::move(new_buffer); if (error_code != ErrorCode_Success) { return error_code; } - - if (0 == m_size) { - // if buffer is uninitialized, reset the cursor_pos - // in case it's after a seek - m_cursor = m_file_pos & m_reader_buffer_cursor_mask; - } m_size += num_bytes_refilled; } @@ -121,19 +120,16 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, num_bytes_read = 0; size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; - size_t num_bytes_read_from_buffer {0}; // keep reading until enough data is read or an eof is seen while (true) { - // if the data in the buffer is valid - if (m_size != 0) { - num_bytes_read_from_buffer = std::min(num_bytes_to_read_from_buffer, - buffer_remaining_data()); - memcpy(buf + num_bytes_read, buffer_head(), num_bytes_read_from_buffer); - num_bytes_to_read_from_buffer -= num_bytes_read_from_buffer; - num_bytes_read += num_bytes_read_from_buffer; - m_cursor += num_bytes_read_from_buffer; - m_file_pos += num_bytes_read_from_buffer; - } + auto avaiable_bytes_for_read = std::min(num_bytes_to_read_from_buffer, + remaining_data_size()); + memcpy(buf + num_bytes_read, buffer_head(), avaiable_bytes_for_read); + + num_bytes_to_read_from_buffer -= avaiable_bytes_for_read; + num_bytes_read += avaiable_bytes_for_read; + + m_file_pos += avaiable_bytes_for_read; if (num_bytes_to_read_from_buffer == 0) { break; } @@ -170,24 +166,18 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { SPDLOG_ERROR("Seek back before the checkpoint is not supported"); return ErrorCode_Failure; } - m_cursor -= (m_file_pos - pos); - m_file_pos = pos; } else { - auto 
buffer_available_data = buffer_remaining_data(); + auto buffer_available_data = remaining_data_size(); auto seek_distance = pos - m_file_pos; if (seek_distance <= buffer_available_data) { - m_cursor += seek_distance; - m_file_pos = pos; } else if (false == m_checkpoint_enabled) { - auto buffer_aligned_pos = pos & m_reader_buffer_aligned_mask; - auto offset = lseek(m_fd, buffer_aligned_pos, SEEK_SET); + m_buffer_begin_pos = pos & m_reader_buffer_aligned_mask; + auto offset = lseek(m_fd, m_buffer_begin_pos, SEEK_SET); if (offset == -1) { return ErrorCode_errno; } // invalidate buffered_data - // Maybe we need a flag for it? m_size = 0; - m_file_pos = pos; } else { size_t num_bytes_to_refill = seek_distance - buffer_available_data; size_t num_bytes_refilled {0}; @@ -197,18 +187,19 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { SPDLOG_ERROR("not expecting to seek pass the Entire file"); throw; } - else if (ErrorCode_Success != error_code) { + if (ErrorCode_Success != error_code) { return error_code; } if (num_bytes_to_refill <= m_reader_buffer_size) { - m_file_pos = pos; - m_cursor += seek_distance; break; } num_bytes_to_refill -= num_bytes_refilled; } } } + + // update the m_file_pos at success + m_file_pos = pos; return ErrorCode_Success; } @@ -227,7 +218,7 @@ ErrorCode BufferedFileReader::try_open (const string& path) { m_path = path; m_file_pos = 0; m_size = 0; - m_cursor = 0; + m_buffer_begin_pos = 0; return ErrorCode_Success; } @@ -244,11 +235,9 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim bool found_delim {false}; size_t delim_pos; while (false == found_delim) { - auto remaining_data_size = m_size - m_cursor; // find the pointer pointing to the delimiter - const auto* delim_ptr = - reinterpret_cast(memchr(buffer_head(), - delim, remaining_data_size)); + const auto* delim_ptr = reinterpret_cast(memchr(buffer_head(), delim, + remaining_data_size())); if (delim_ptr != nullptr) { delim_pos = (delim_ptr - 
m_buffer.get()) + 1; found_delim = true; @@ -256,13 +245,10 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim delim_pos = m_size; } // append to strings - size_t copy_length = delim_pos - m_cursor; - std::string_view substr(reinterpret_cast(buffer_head()), - copy_length); - str.append(substr); + size_t str_length = delim_pos - cursor_pos(); + str.append(reinterpret_cast(buffer_head()), str_length); // increment file pos to the delimiter or the end of file - m_file_pos += copy_length; - m_cursor = delim_pos; + m_file_pos += str_length; if (false == found_delim) { if (auto error_code = refill_reader_buffer(m_reader_buffer_size); ErrorCode_Success != error_code) { @@ -290,7 +276,6 @@ void BufferedFileReader::revert_pos() { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } m_file_pos = m_checkpoint_pos; - m_cursor = m_checkpointed_buffer_pos; } size_t BufferedFileReader::mark_pos() { @@ -299,7 +284,6 @@ size_t BufferedFileReader::mark_pos() { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } m_checkpoint_pos = m_file_pos; - m_checkpointed_buffer_pos = m_cursor; m_checkpoint_enabled = true; return m_file_pos; } @@ -311,18 +295,15 @@ void BufferedFileReader::reset_checkpoint () { } if (m_size != m_reader_buffer_size) { // allocate new buffer for buffered data that hasn't been seek passed - auto copy_pos = m_cursor & m_reader_buffer_aligned_mask; - auto copy_size = m_size - copy_pos; + auto copy_pos = cursor_pos() & m_reader_buffer_aligned_mask; + m_size -= copy_pos; // Use a quantized size for the new buffer size - auto new_buffer_size = (1 + ((copy_size - 1) >> m_reader_buffer_exp)) - << m_reader_buffer_exp; + auto new_buffer_size = (1 + ((m_size - 1) >> m_reader_buffer_exp)) << m_reader_buffer_exp; auto new_buffer = make_unique(new_buffer_size); - memcpy(new_buffer.get(), m_buffer.get() + copy_pos, copy_size); + memcpy(new_buffer.get(), &m_buffer[copy_pos], m_size); m_buffer = std::move(new_buffer); 
- - m_size = copy_size; - m_cursor -= copy_pos; + m_buffer_begin_pos += copy_pos; } m_checkpoint_enabled = false; } @@ -350,7 +331,6 @@ ErrorCode BufferedFileReader::set_buffer_size (size_t buffer_size) { m_reader_buffer_exp = static_cast(exponent); m_reader_buffer_size = buffer_size; m_reader_buffer_aligned_mask = ~(m_reader_buffer_size - 1); - m_reader_buffer_cursor_mask = m_reader_buffer_size - 1; return ErrorCode_Success; } diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 75e86b258..7c819067f 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -117,12 +117,10 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); private: - - [[nodiscard]] size_t buffer_remaining_data() { - return m_size - m_cursor; - } + [[nodiscard]] size_t cursor_pos() { return m_file_pos - m_buffer_begin_pos; } + [[nodiscard]] size_t remaining_data_size(); [[nodiscard]] int8_t* buffer_head() { - return m_buffer.get() + m_cursor; + return m_buffer.get() + cursor_pos(); } [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); @@ -135,7 +133,7 @@ class BufferedFileReader : public ReaderInterface { // Buffer specific data std::unique_ptr m_buffer; size_t m_size; - size_t m_cursor; + size_t m_buffer_begin_pos; // constant flag size_t m_reader_buffer_exp; From 58458054562850a13a666d49341bd6aba0da8099 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 17 May 2023 14:58:32 -0400 Subject: [PATCH 028/121] Add UTF8 utility to the FBR. 
--- components/core/src/BufferedFileReader.cpp | 82 ++++++++++++++-------- components/core/src/BufferedFileReader.hpp | 18 ++--- components/core/src/Utils.cpp | 34 +++++++++ components/core/src/Utils.hpp | 8 +++ components/core/src/clp/FileCompressor.cpp | 1 + components/core/src/clp/utils.cpp | 34 --------- components/core/src/clp/utils.hpp | 8 --- 7 files changed, 106 insertions(+), 79 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index c148be7f9..ade3969b7 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -13,17 +13,23 @@ // Project headers #include +#include "Utils.hpp" using std::make_unique; using std::move; using std::string; +static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, + size_t& num_bytes_read); + BufferedFileReader::BufferedFileReader () { m_file_pos = 0; m_fd = -1; m_checkpoint_enabled = false; - if (ErrorCode_Success != set_buffer_size(DefaultBufferSize)) { - throw "Failed to init reader buffer size\n"; + if (auto error_code = set_buffer_size(cDefaultBufferSize); + ErrorCode_Success != error_code) { + SPDLOG_ERROR("Failed to init reader buffer size to be {}", cDefaultBufferSize); + throw OperationFailed(error_code, __FILENAME__, __LINE__); } m_buffer = make_unique(m_reader_buffer_size); } @@ -32,8 +38,7 @@ BufferedFileReader::~BufferedFileReader () { close(); } -size_t BufferedFileReader::remaining_data_size () -{ +size_t BufferedFileReader::remaining_data_size () const { if (m_size == 0) { return 0; } @@ -49,30 +54,6 @@ ErrorCode BufferedFileReader::try_get_pos (size_t& pos) { return ErrorCode_Success; } -static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, - size_t& num_bytes_read) { - num_bytes_read = 0; - // keep reading from the fd until seeing a 0, which means eof - while (true) { - auto bytes_read = ::read(fd, buffer + num_bytes_read, 
num_bytes_to_read); - if (bytes_read == -1) { - return ErrorCode_errno; - } - if (bytes_read == 0) { - break; - } - num_bytes_read += bytes_read; - num_bytes_to_read -= bytes_read; - if (num_bytes_to_read == 0) { - return ErrorCode_Success; - } - } - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } - return ErrorCode_Success; -} - ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size) { size_t num_bytes_refilled; return refill_reader_buffer (refill_size, num_bytes_refilled); @@ -193,6 +174,8 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (num_bytes_to_refill <= m_reader_buffer_size) { break; } + // do we need to distinguish num_bytes_refilled vs m_reader_buffer_size + // since we might anyway terminate the loop? num_bytes_to_refill -= num_bytes_refilled; } } @@ -203,6 +186,25 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } +ErrorCode BufferedFileReader::is_utf8_encoded (bool& is_utf8) { + if (-1 == m_fd) { + return ErrorCode_NotInit; + } + if (m_file_pos != 0) { + return ErrorCode_Unsupported; + } + // Refill the buffer if necessary + if (0 == m_size) { + if (auto error_code = refill_reader_buffer(m_reader_buffer_size); + ErrorCode_Success != error_code && + ErrorCode_EndOfFile != error_code) { + return error_code; + } + } + auto bytes_to_verify = std::min(cPageSize, m_size); + is_utf8 = is_utf8_sequence(bytes_to_verify, reinterpret_cast(m_buffer.get())); + return ErrorCode_Success; +} ErrorCode BufferedFileReader::try_open (const string& path) { // Cleanup in case caller forgot to call close before calling this function @@ -360,3 +362,27 @@ ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { } return ErrorCode_Success; } + +static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, + size_t& num_bytes_read) { + num_bytes_read = 0; + // keep reading from the fd until seeing a 0, which means eof + while (true) { + 
auto bytes_read = ::read(fd, buffer + num_bytes_read, num_bytes_to_read); + if (bytes_read == -1) { + return ErrorCode_errno; + } + if (bytes_read == 0) { + break; + } + num_bytes_read += bytes_read; + num_bytes_to_read -= bytes_read; + if (num_bytes_to_read == 0) { + return ErrorCode_Success; + } + } + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; +} diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 7c819067f..46a71fe78 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -17,7 +17,8 @@ class BufferedFileReader : public ReaderInterface { public: - static constexpr size_t DefaultBufferSize = 65536; + static constexpr size_t cDefaultBufferSize = 65536; + static constexpr size_t cPageSize = 4096; // Types class OperationFailed : public TraceableException { public: @@ -111,17 +112,18 @@ class BufferedFileReader : public ReaderInterface { */ [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer) const; + [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); + + [[nodiscard]] ErrorCode is_utf8_encoded(bool& is_utf8); + size_t mark_pos(); void revert_pos(); void reset_checkpoint (); - [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); private: - [[nodiscard]] size_t cursor_pos() { return m_file_pos - m_buffer_begin_pos; } - [[nodiscard]] size_t remaining_data_size(); - [[nodiscard]] int8_t* buffer_head() { - return m_buffer.get() + cursor_pos(); - } + [[nodiscard]] size_t cursor_pos() const { return m_file_pos - m_buffer_begin_pos; } + [[nodiscard]] size_t remaining_data_size() const; + [[nodiscard]] int8_t* buffer_head() const { return m_buffer.get() + cursor_pos(); } [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); @@ -139,11 +141,9 @@ class BufferedFileReader : public ReaderInterface 
{ size_t m_reader_buffer_exp; size_t m_reader_buffer_size; size_t m_reader_buffer_aligned_mask; - size_t m_reader_buffer_cursor_mask; // checkpoint specific data bool m_checkpoint_enabled; size_t m_checkpoint_pos; - size_t m_checkpointed_buffer_pos; }; diff --git a/components/core/src/Utils.cpp b/components/core/src/Utils.cpp index 328cdfd4c..1e9098109 100644 --- a/components/core/src/Utils.cpp +++ b/components/core/src/Utils.cpp @@ -215,3 +215,37 @@ ErrorCode read_list_of_paths (const string& list_path, vector& paths) { return ErrorCode_Success; } + +bool is_utf8_sequence (size_t sequence_length, const char* sequence) { + size_t num_utf8_bytes_to_read = 0; + for (size_t i = 0; i < sequence_length; ++i) { + auto byte = sequence[i]; + + if (num_utf8_bytes_to_read > 0) { + // Validate that byte matches 0b10xx_xxxx + if ((byte & 0xC0) != 0x80) { + return false; + } + --num_utf8_bytes_to_read; + } else { + if (byte & 0x80) { + // Check if byte is valid UTF-8 length-indicator + if ((byte & 0xF8) == 0xF0) { + // Matches 0b1111_0xxx + num_utf8_bytes_to_read = 3; + } else if ((byte & 0xF0) == 0xE0) { + // Matches 0b1110_xxxx + num_utf8_bytes_to_read = 2; + } else if ((byte & 0xE0) == 0xC0) { + // Matches 0b110x_xxxx + num_utf8_bytes_to_read = 1; + } else { + // Invalid UTF-8 length-indicator + return false; + } + } // else byte is ASCII + } + } + + return true; +} \ No newline at end of file diff --git a/components/core/src/Utils.hpp b/components/core/src/Utils.hpp index 6f8b843f3..a6a3ad635 100644 --- a/components/core/src/Utils.hpp +++ b/components/core/src/Utils.hpp @@ -108,4 +108,12 @@ std::string get_unambiguous_path (const std::string& path); */ ErrorCode read_list_of_paths (const std::string& list_path, std::vector& paths); +/** + * Checks if the given sequence is valid UTF-8 + * @param sequence_length + * @param sequence + * @return true if valid, false otherwise + */ +bool is_utf8_sequence (size_t sequence_length, const char* sequence); + #endif // UTILS_HPP diff 
--git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index e75382d2b..8a788fcb9 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -13,6 +13,7 @@ // Project headers #include "../Profiler.hpp" +#include "../Utils.hpp" #include "utils.hpp" using std::cout; diff --git a/components/core/src/clp/utils.cpp b/components/core/src/clp/utils.cpp index b0fb883e9..5869673b1 100644 --- a/components/core/src/clp/utils.cpp +++ b/components/core/src/clp/utils.cpp @@ -64,40 +64,6 @@ namespace clp { return true; } - bool is_utf8_sequence (size_t sequence_length, const char* sequence) { - size_t num_utf8_bytes_to_read = 0; - for (size_t i = 0; i < sequence_length; ++i) { - auto byte = sequence[i]; - - if (num_utf8_bytes_to_read > 0) { - // Validate that byte matches 0b10xx_xxxx - if ((byte & 0xC0) != 0x80) { - return false; - } - --num_utf8_bytes_to_read; - } else { - if (byte & 0x80) { - // Check if byte is valid UTF-8 length-indicator - if ((byte & 0xF8) == 0xF0) { - // Matches 0b1111_0xxx - num_utf8_bytes_to_read = 3; - } else if ((byte & 0xF0) == 0xE0) { - // Matches 0b1110_xxxx - num_utf8_bytes_to_read = 2; - } else if ((byte & 0xE0) == 0xC0) { - // Matches 0b110x_xxxx - num_utf8_bytes_to_read = 1; - } else { - // Invalid UTF-8 length-indicator - return false; - } - } // else byte is ASCII - } - } - - return true; - } - bool read_input_paths (const string& list_path, vector& paths) { ErrorCode error_code = read_list_of_paths(list_path, paths); if (ErrorCode_Success != error_code) { diff --git a/components/core/src/clp/utils.hpp b/components/core/src/clp/utils.hpp index 0796d510c..f5f8e9438 100644 --- a/components/core/src/clp/utils.hpp +++ b/components/core/src/clp/utils.hpp @@ -24,14 +24,6 @@ namespace clp { bool find_all_files_and_empty_directories (boost::filesystem::path& path_prefix_to_remove, const std::string& path, std::vector& file_paths, std::vector& 
empty_directory_paths); - /** - * Checks if the given sequence is valid UTF-8 - * @param sequence_length - * @param sequence - * @return true if valid, false otherwise - */ - bool is_utf8_sequence (size_t sequence_length, const char* sequence); - /** * Reads a list of input paths * @param list_path From b90e0d6c587a0fb328bccc2d585afaf1f1325ad8 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 17 May 2023 22:12:27 -0400 Subject: [PATCH 029/121] replace UTF8 validation logic --- components/core/CMakeLists.txt | 2 + components/core/src/BufferedFileReader.cpp | 17 +++--- components/core/src/BufferedFileReader.hpp | 6 +- components/core/src/LibarchiveFileReader.cpp | 58 +++++++++++++++++++- components/core/src/LibarchiveFileReader.hpp | 17 +++++- components/core/src/LibarchiveReader.cpp | 44 ++++++--------- components/core/src/LibarchiveReader.hpp | 6 +- components/core/src/Utils.cpp | 34 ------------ components/core/src/Utils.hpp | 8 --- components/core/src/clp/FileCompressor.cpp | 39 +++++-------- components/core/src/clp/FileCompressor.hpp | 6 +- components/core/src/clp/utils.cpp | 34 ++++++++++++ components/core/src/clp/utils.hpp | 8 +++ 13 files changed, 163 insertions(+), 116 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 51c83a75b..262ce172d 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -159,6 +159,8 @@ FindDynamicLibraryDependencies(sqlite "${sqlite_DYNAMIC_LIBS}") set(SOURCE_FILES_clp src/ArrayBackedPosIntSet.cpp src/ArrayBackedPosIntSet.hpp + src/BufferedFileReader.cpp + src/BufferedFileReader.hpp src/BufferReader.cpp src/BufferReader.hpp src/clp/clp.cpp diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index ade3969b7..6a6378bc0 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -186,23 +186,22 @@ ErrorCode 
BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } -ErrorCode BufferedFileReader::is_utf8_encoded (bool& is_utf8) { +ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const char*& data_ptr, + size_t& peek_size) { if (-1 == m_fd) { return ErrorCode_NotInit; } - if (m_file_pos != 0) { - return ErrorCode_Unsupported; - } // Refill the buffer if necessary if (0 == m_size) { - if (auto error_code = refill_reader_buffer(m_reader_buffer_size); - ErrorCode_Success != error_code && - ErrorCode_EndOfFile != error_code) { + auto error_code = refill_reader_buffer(m_reader_buffer_size); + if (ErrorCode_Success != error_code) { + data_ptr = nullptr; + peek_size = 0; return error_code; } } - auto bytes_to_verify = std::min(cPageSize, m_size); - is_utf8 = is_utf8_sequence(bytes_to_verify, reinterpret_cast(m_buffer.get())); + peek_size = std::min(size_to_peek, remaining_data_size()); + data_ptr = reinterpret_cast(buffer_head()); return ErrorCode_Success; } diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 46a71fe78..bcfb0ac38 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -18,7 +18,6 @@ class BufferedFileReader : public ReaderInterface { public: static constexpr size_t cDefaultBufferSize = 65536; - static constexpr size_t cPageSize = 4096; // Types class OperationFailed : public TraceableException { public: @@ -113,9 +112,8 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer) const; [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); - - [[nodiscard]] ErrorCode is_utf8_encoded(bool& is_utf8); - + [[nodiscard]] ErrorCode peek_buffered_data(size_t size_to_peek, const char*& data_ptr, + size_t& peek_size); size_t mark_pos(); void revert_pos(); void reset_checkpoint (); diff --git a/components/core/src/LibarchiveFileReader.cpp 
b/components/core/src/LibarchiveFileReader.cpp index 60c41178f..b04948023 100644 --- a/components/core/src/LibarchiveFileReader.cpp +++ b/components/core/src/LibarchiveFileReader.cpp @@ -167,6 +167,62 @@ ErrorCode LibarchiveFileReader::try_read_to_delimiter (char delim, bool keep_del return ErrorCode_Success; } +ErrorCode LibarchiveFileReader::peek_data_block (size_t size_to_peek, const char*& data_ptr, + size_t& peek_size) { + if (nullptr == m_archive) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + if (nullptr == m_archive_entry) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + // Read a data block if necessary + if (nullptr == m_data_block) { + auto error_code = read_next_data_block(); + if (ErrorCode_Success != error_code) { + data_ptr = nullptr; + peek_size = 0; + return error_code; + } + } + + // If we don't need to simulate reading '\0' before the start of the data block + // simply return a const& to the current data block + if (m_data_block_pos_in_file <= m_pos_in_file) { + peek_size = std::min(size_to_peek, m_data_block_length - m_pos_in_data_block); + data_ptr = reinterpret_cast(m_data_block); + return ErrorCode_Success; + } + + // If there are sparse bytes before the data block, the pos in data block + // must be 0 + assert(m_pos_in_data_block != 0); + + auto num_sparse_bytes = m_data_block_pos_in_file - m_pos_in_file; + peek_size = std::min(num_sparse_bytes + m_data_block_length, size_to_peek); + + // resize the local buffer is necessary + if (m_peek_data_size < peek_size) { + m_data_for_peek = std::make_unique(peek_size); + m_peek_data_size = peek_size; + } + data_ptr = reinterpret_cast(m_data_for_peek.get()); + + if (size_to_peek < num_sparse_bytes) { + memset(m_data_for_peek.get(), '\0', size_to_peek); + return ErrorCode_Success; + } + + // if size to peek is greater than number of sparse bytes, + // copy over the data from data_block to the peek data buffer + memset(m_data_for_peek.get(), 
'\0', num_sparse_bytes); + size_t remaining_bytes_to_peek = peek_size - num_sparse_bytes; + const char* data = reinterpret_cast(m_data_block); + memcpy(&m_data_for_peek[num_sparse_bytes], data, remaining_bytes_to_peek); + + return ErrorCode_Success; +} + void LibarchiveFileReader::open (struct archive* archive, struct archive_entry* archive_entry) { if (nullptr == archive) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); @@ -212,4 +268,4 @@ ErrorCode LibarchiveFileReader::read_next_data_block () { m_pos_in_data_block = 0; return ErrorCode_Success; -} +} \ No newline at end of file diff --git a/components/core/src/LibarchiveFileReader.hpp b/components/core/src/LibarchiveFileReader.hpp index 97b4f2473..0e33579e6 100644 --- a/components/core/src/LibarchiveFileReader.hpp +++ b/components/core/src/LibarchiveFileReader.hpp @@ -2,6 +2,7 @@ #define LIBARCHIVEFILEREADER_HPP // C++ standard libraries +#include #include // libarchive @@ -30,7 +31,8 @@ class LibarchiveFileReader : public ReaderInterface { }; // Constructors - LibarchiveFileReader () : m_archive(nullptr), m_archive_entry(nullptr), m_data_block(nullptr), m_reached_eof(false), m_pos_in_file(0) {} + LibarchiveFileReader () : m_archive(nullptr), m_archive_entry(nullptr), m_data_block(nullptr), + m_reached_eof(false), m_pos_in_file(0), m_peek_data_size(0) {} // Methods implementing the ReaderInterface /** @@ -69,6 +71,17 @@ class LibarchiveFileReader : public ReaderInterface { */ ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; + /** + * Tries to peek up to a given number of bytes from the file. + * Note: This function only tries to peek within the next data block. 
+ * @param size_to_peek + * @param data_ptr + * @param peek_size + * @return ErrorCode_EndOfFile on EOF + * @return ErrorCode_Failure on failure + * @return ErrorCode_Success on success + */ + [[nodiscard]] ErrorCode peek_data_block(size_t size_to_peek, const char*& data_ptr, size_t& peek_size); // Methods /** * Opens the file reader @@ -98,6 +111,8 @@ class LibarchiveFileReader : public ReaderInterface { la_int64_t m_data_block_pos_in_file; const void* m_data_block; size_t m_data_block_length; + std::unique_ptr m_data_for_peek; + size_t m_peek_data_size; la_int64_t m_pos_in_data_block; bool m_reached_eof; diff --git a/components/core/src/LibarchiveReader.cpp b/components/core/src/LibarchiveReader.cpp index f8cd50987..d993ec7a0 100644 --- a/components/core/src/LibarchiveReader.cpp +++ b/components/core/src/LibarchiveReader.cpp @@ -9,7 +9,7 @@ // Project headers #include "Defs.h" -ErrorCode LibarchiveReader::try_open (size_t buffer_length, const char* buffer, ReaderInterface& reader, const std::string& path_if_compressed_file) { +ErrorCode LibarchiveReader::try_open (ReaderInterface& file_reader, const std::string& path_if_compressed_file) { // Create and initialize internal libarchive m_archive = archive_read_new(); if (nullptr == m_archive) { @@ -34,17 +34,11 @@ ErrorCode LibarchiveReader::try_open (size_t buffer_length, const char* buffer, throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - // Copy initial buffer content - m_buffer.resize(buffer_length); - memcpy(m_buffer.data(), buffer, buffer_length); - m_initial_buffer_content_exhausted = m_buffer.empty(); - - m_file_reader = &reader; - + m_file_reader = &file_reader; m_filename_if_compressed = path_if_compressed_file; - - return_value = archive_read_open2(m_archive, this, libarchive_open_callback, libarchive_read_callback, libarchive_skip_callback, + return_value = archive_read_open2(m_archive, this, libarchive_open_callback, + libarchive_read_callback, libarchive_skip_callback, 
libarchive_close_callback); if (ARCHIVE_OK != return_value) { SPDLOG_DEBUG("Failed to open libarchive - {}", archive_error_string(m_archive)); @@ -186,27 +180,23 @@ ErrorCode LibarchiveReader::libarchive_read_callback (const void** buffer, size_ if (false == m_is_opened_by_libarchive) { return ErrorCode_NotInit; } - - if (false == m_initial_buffer_content_exhausted) { - *buffer = m_buffer.data(); - num_bytes_read = m_buffer.size(); - m_initial_buffer_content_exhausted = true; - } else { - constexpr size_t cTargetBufferLength = 4096; - m_buffer.resize(cTargetBufferLength); - auto error_code = m_file_reader->try_read(m_buffer.data(), cTargetBufferLength, num_bytes_read); - if (ErrorCode_Success != error_code) { - return error_code; - } - if (num_bytes_read < cTargetBufferLength) { - m_buffer.resize(num_bytes_read); - } + constexpr size_t cTargetBufferLength = 4096; + m_buffer.resize(cTargetBufferLength); + auto error_code = m_file_reader->try_read(m_buffer.data(), cTargetBufferLength, num_bytes_read); + if (ErrorCode_Success != error_code) { + return error_code; } - + if (num_bytes_read < cTargetBufferLength) { + m_buffer.resize(num_bytes_read); + } + *buffer = m_buffer.data(); return ErrorCode_Success; } -ErrorCode LibarchiveReader::libarchive_skip_callback (off_t num_bytes_to_skip, size_t& num_bytes_skipped) { +ErrorCode LibarchiveReader::libarchive_skip_callback (off_t num_bytes_to_skip, + size_t& num_bytes_skipped) { + + // skip bytes by simply reading data into a temporary buffer std::vector temporary_read_buffer; auto error_code = m_file_reader->try_read(temporary_read_buffer.data(), num_bytes_to_skip, num_bytes_skipped); diff --git a/components/core/src/LibarchiveReader.hpp b/components/core/src/LibarchiveReader.hpp index 0f2e7a8ff..1e583fca4 100644 --- a/components/core/src/LibarchiveReader.hpp +++ b/components/core/src/LibarchiveReader.hpp @@ -43,15 +43,13 @@ class LibarchiveReader { // Methods /** - * Tries to open the archive or compressed file contained 
across the given buffer and FileReader - * @param buffer_length - * @param buffer + * Tries to open the archive or compressed file contained in the FileReader * @param file_reader * @param path_if_compressed_file Path to use if the data is a single compressed file * @return ErrorCode_Success on success * @return ErrorCode_Failure on failure */ - ErrorCode try_open (size_t buffer_length, const char* buffer, ReaderInterface& reader, const std::string& path_if_compressed_file); + ErrorCode try_open (ReaderInterface& file_reader, const std::string& path_if_compressed_file); /** * Closes the reader */ diff --git a/components/core/src/Utils.cpp b/components/core/src/Utils.cpp index 1e9098109..328cdfd4c 100644 --- a/components/core/src/Utils.cpp +++ b/components/core/src/Utils.cpp @@ -215,37 +215,3 @@ ErrorCode read_list_of_paths (const string& list_path, vector& paths) { return ErrorCode_Success; } - -bool is_utf8_sequence (size_t sequence_length, const char* sequence) { - size_t num_utf8_bytes_to_read = 0; - for (size_t i = 0; i < sequence_length; ++i) { - auto byte = sequence[i]; - - if (num_utf8_bytes_to_read > 0) { - // Validate that byte matches 0b10xx_xxxx - if ((byte & 0xC0) != 0x80) { - return false; - } - --num_utf8_bytes_to_read; - } else { - if (byte & 0x80) { - // Check if byte is valid UTF-8 length-indicator - if ((byte & 0xF8) == 0xF0) { - // Matches 0b1111_0xxx - num_utf8_bytes_to_read = 3; - } else if ((byte & 0xF0) == 0xE0) { - // Matches 0b1110_xxxx - num_utf8_bytes_to_read = 2; - } else if ((byte & 0xE0) == 0xC0) { - // Matches 0b110x_xxxx - num_utf8_bytes_to_read = 1; - } else { - // Invalid UTF-8 length-indicator - return false; - } - } // else byte is ASCII - } - } - - return true; -} \ No newline at end of file diff --git a/components/core/src/Utils.hpp b/components/core/src/Utils.hpp index a6a3ad635..6f8b843f3 100644 --- a/components/core/src/Utils.hpp +++ b/components/core/src/Utils.hpp @@ -108,12 +108,4 @@ std::string get_unambiguous_path (const 
std::string& path); */ ErrorCode read_list_of_paths (const std::string& list_path, std::vector& paths); -/** - * Checks if the given sequence is valid UTF-8 - * @param sequence_length - * @param sequence - * @return true if valid, false otherwise - */ -bool is_utf8_sequence (size_t sequence_length, const char* sequence); - #endif // UTILS_HPP diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 8a788fcb9..e81fbc1f7 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -13,7 +13,6 @@ // Project headers #include "../Profiler.hpp" -#include "../Utils.hpp" #include "utils.hpp" using std::cout; @@ -91,12 +90,13 @@ namespace clp { m_file_reader.open(file_to_compress.get_path()); // Check that file is UTF-8 encoded - auto error_code = m_file_reader.try_read(m_utf8_validation_buf, cUtf8ValidationBufCapacity, m_utf8_validation_buf_length); - if (ErrorCode_Success != error_code) { - if (ErrorCode_EndOfFile != error_code) { - SPDLOG_ERROR("Failed to read {}, errno={}", file_to_compress.get_path().c_str(), errno); - return false; - } + if (auto error_code = m_file_reader.peek_buffered_data(cUtf8ValidationBufCapacity, + m_utf8_validation_buf, + m_utf8_validation_buf_length); + ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) { + SPDLOG_ERROR("Failed to peek data from {}, errno={}", + file_to_compress.get_path().c_str(), errno); + return error_code; } bool succeeded = true; if (is_utf8_sequence(m_utf8_validation_buf_length, m_utf8_validation_buf)) { @@ -140,7 +140,6 @@ namespace clp { // TODO: decide what to actually do about this // for now reset reader rather than try reading m_utf8_validation_buf as it would be // very awkward to combine sources to/in the parser - reader.seek_from_begin(0); m_log_parser->set_archive_writer_ptr(&archive_writer); m_log_parser->get_archive_writer_ptr()->old_ts_pattern.clear(); try { @@ -172,19 +171,7 @@ namespace clp { 
// Open compressed file archive_writer.create_and_open_file(path_for_compression, group_id, m_uuid_generator(), 0); - // Parse content from UTF-8 validation buffer - size_t buf_pos = 0; - while (m_message_parser.parse_next_message(false, m_utf8_validation_buf_length, m_utf8_validation_buf, buf_pos, m_parsed_message)) { - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { - split_file_and_archive(archive_user_config, path_for_compression, group_id, m_parsed_message.get_ts_patt(), archive_writer); - } else if (archive_writer.get_file().get_encoded_size_in_bytes() >= target_encoded_file_size) { - split_file(path_for_compression, group_id, m_parsed_message.get_ts_patt(), archive_writer); - } - - write_message_to_encoded_file(m_parsed_message, archive_writer); - } - - // Parse remaining content from file + // Parse content from file while (m_message_parser.parse_next_message(true, reader, m_parsed_message)) { if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { split_file_and_archive(archive_user_config, path_for_compression, group_id, m_parsed_message.get_ts_patt(), archive_writer); @@ -214,7 +201,7 @@ namespace clp { } // Check if it's an archive - auto error_code = m_libarchive_reader.try_open(m_utf8_validation_buf_length, m_utf8_validation_buf, m_file_reader, filename_if_compressed); + auto error_code = m_libarchive_reader.try_open(m_file_reader, filename_if_compressed); if (ErrorCode_Success != error_code) { SPDLOG_ERROR("Cannot compress {} - failed to open with libarchive.", file_to_compress.get_path().c_str()); return false; @@ -262,12 +249,14 @@ namespace clp { } m_libarchive_reader.open_file_reader(m_libarchive_file_reader); - + error_code = m_libarchive_file_reader.peek_data_block(cUtf8ValidationBufCapacity, + m_utf8_validation_buf, + m_utf8_validation_buf_length); // Check that file is UTF-8 encoded - error_code = m_libarchive_file_reader.try_read(m_utf8_validation_buf, cUtf8ValidationBufCapacity, 
m_utf8_validation_buf_length); if (ErrorCode_Success != error_code) { if (ErrorCode_EndOfFile != error_code) { - SPDLOG_ERROR("Failed to read {} from {}.", m_libarchive_reader.get_path(), file_to_compress.get_path().c_str()); + SPDLOG_ERROR("Failed to peek data from {}, errno={}", + file_to_compress.get_path().c_str(), errno); m_libarchive_file_reader.close(); succeeded = false; continue; diff --git a/components/core/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp index faa6d0a07..e2fe3d92d 100644 --- a/components/core/src/clp/FileCompressor.hpp +++ b/components/core/src/clp/FileCompressor.hpp @@ -5,7 +5,7 @@ #include // Project headers -#include "../FileReader.hpp" +#include "../BufferedFileReader.hpp" #include "../LibarchiveFileReader.hpp" #include "../LibarchiveReader.hpp" #include "../MessageParser.hpp" @@ -77,10 +77,10 @@ namespace clp { // Variables boost::uuids::random_generator& m_uuid_generator; - FileReader m_file_reader; + BufferedFileReader m_file_reader; LibarchiveReader m_libarchive_reader; LibarchiveFileReader m_libarchive_file_reader; - char m_utf8_validation_buf[cUtf8ValidationBufCapacity]; + const char* m_utf8_validation_buf; size_t m_utf8_validation_buf_length; MessageParser m_message_parser; ParsedMessage m_parsed_message; diff --git a/components/core/src/clp/utils.cpp b/components/core/src/clp/utils.cpp index 5869673b1..b0fb883e9 100644 --- a/components/core/src/clp/utils.cpp +++ b/components/core/src/clp/utils.cpp @@ -64,6 +64,40 @@ namespace clp { return true; } + bool is_utf8_sequence (size_t sequence_length, const char* sequence) { + size_t num_utf8_bytes_to_read = 0; + for (size_t i = 0; i < sequence_length; ++i) { + auto byte = sequence[i]; + + if (num_utf8_bytes_to_read > 0) { + // Validate that byte matches 0b10xx_xxxx + if ((byte & 0xC0) != 0x80) { + return false; + } + --num_utf8_bytes_to_read; + } else { + if (byte & 0x80) { + // Check if byte is valid UTF-8 length-indicator + if ((byte & 0xF8) == 0xF0) { + // 
Matches 0b1111_0xxx + num_utf8_bytes_to_read = 3; + } else if ((byte & 0xF0) == 0xE0) { + // Matches 0b1110_xxxx + num_utf8_bytes_to_read = 2; + } else if ((byte & 0xE0) == 0xC0) { + // Matches 0b110x_xxxx + num_utf8_bytes_to_read = 1; + } else { + // Invalid UTF-8 length-indicator + return false; + } + } // else byte is ASCII + } + } + + return true; + } + bool read_input_paths (const string& list_path, vector& paths) { ErrorCode error_code = read_list_of_paths(list_path, paths); if (ErrorCode_Success != error_code) { diff --git a/components/core/src/clp/utils.hpp b/components/core/src/clp/utils.hpp index f5f8e9438..0796d510c 100644 --- a/components/core/src/clp/utils.hpp +++ b/components/core/src/clp/utils.hpp @@ -24,6 +24,14 @@ namespace clp { bool find_all_files_and_empty_directories (boost::filesystem::path& path_prefix_to_remove, const std::string& path, std::vector& file_paths, std::vector& empty_directory_paths); + /** + * Checks if the given sequence is valid UTF-8 + * @param sequence_length + * @param sequence + * @return true if valid, false otherwise + */ + bool is_utf8_sequence (size_t sequence_length, const char* sequence); + /** * Reads a list of input paths * @param list_path From e9230d0ca0e8e137a00e2ef94f6a86f2e4d6cbde Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 18 May 2023 17:25:46 -0400 Subject: [PATCH 030/121] Some personal preference --- components/core/src/LibarchiveFileReader.cpp | 11 +++++------ components/core/src/LibarchiveFileReader.hpp | 6 +++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/components/core/src/LibarchiveFileReader.cpp b/components/core/src/LibarchiveFileReader.cpp index b04948023..ff40deb2f 100644 --- a/components/core/src/LibarchiveFileReader.cpp +++ b/components/core/src/LibarchiveFileReader.cpp @@ -202,20 +202,19 @@ ErrorCode LibarchiveFileReader::peek_data_block (size_t size_to_peek, const char peek_size = std::min(num_sparse_bytes + 
m_data_block_length, size_to_peek); // resize the local buffer is necessary - if (m_peek_data_size < peek_size) { - m_data_for_peek = std::make_unique(peek_size); - m_peek_data_size = peek_size; + if (m_data_for_peek.size() < peek_size) { + m_data_for_peek.resize(peek_size); } - data_ptr = reinterpret_cast(m_data_for_peek.get()); + data_ptr = reinterpret_cast(m_data_for_peek.data()); if (size_to_peek < num_sparse_bytes) { - memset(m_data_for_peek.get(), '\0', size_to_peek); + memset(m_data_for_peek.data(), '\0', size_to_peek); return ErrorCode_Success; } // if size to peek is greater than number of sparse bytes, // copy over the data from data_block to the peek data buffer - memset(m_data_for_peek.get(), '\0', num_sparse_bytes); + memset(m_data_for_peek.data(), '\0', num_sparse_bytes); size_t remaining_bytes_to_peek = peek_size - num_sparse_bytes; const char* data = reinterpret_cast(m_data_block); memcpy(&m_data_for_peek[num_sparse_bytes], data, remaining_bytes_to_peek); diff --git a/components/core/src/LibarchiveFileReader.hpp b/components/core/src/LibarchiveFileReader.hpp index 0e33579e6..a6a843705 100644 --- a/components/core/src/LibarchiveFileReader.hpp +++ b/components/core/src/LibarchiveFileReader.hpp @@ -4,6 +4,7 @@ // C++ standard libraries #include #include +#include // libarchive #include @@ -32,7 +33,7 @@ class LibarchiveFileReader : public ReaderInterface { // Constructors LibarchiveFileReader () : m_archive(nullptr), m_archive_entry(nullptr), m_data_block(nullptr), - m_reached_eof(false), m_pos_in_file(0), m_peek_data_size(0) {} + m_reached_eof(false), m_pos_in_file(0) {} // Methods implementing the ReaderInterface /** @@ -111,8 +112,7 @@ class LibarchiveFileReader : public ReaderInterface { la_int64_t m_data_block_pos_in_file; const void* m_data_block; size_t m_data_block_length; - std::unique_ptr m_data_for_peek; - size_t m_peek_data_size; + std::vector m_data_for_peek; la_int64_t m_pos_in_data_block; bool m_reached_eof; From 
27af2035775d09a042fee3a0e92155d99c403247 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 18 May 2023 17:26:24 -0400 Subject: [PATCH 031/121] remove m_cursor and replace with buffer begin_pos --- components/core/src/LibarchiveFileReader.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/components/core/src/LibarchiveFileReader.hpp b/components/core/src/LibarchiveFileReader.hpp index a6a843705..9552dd915 100644 --- a/components/core/src/LibarchiveFileReader.hpp +++ b/components/core/src/LibarchiveFileReader.hpp @@ -2,7 +2,6 @@ #define LIBARCHIVEFILEREADER_HPP // C++ standard libraries -#include #include #include From e82fbb40f71afbc30b7fe68753c5788a259fa91a Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 18 May 2023 21:38:24 -0400 Subject: [PATCH 032/121] int8_t -> char refactoring --- components/core/src/BufferReader.cpp | 20 +-- components/core/src/BufferReader.hpp | 36 ++---- components/core/src/BufferedFileReader.cpp | 114 +++++++++--------- components/core/src/BufferedFileReader.hpp | 14 +-- .../core/tests/test-ir_encoding_methods.cpp | 52 +++++--- 5 files changed, 119 insertions(+), 117 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 777ab5950..baba3dcf8 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -8,11 +8,17 @@ using std::string_view; -[[nodiscard]] ErrorCode BufferReader::try_get_pos (size_t& pos) { +ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { if (nullptr == m_data) { return ErrorCode_NotInit; } - pos = m_cursor_pos; + if (nullptr == buf) { + return ErrorCode_BadParam; + } + + num_bytes_read = std::min(m_size - m_cursor_pos, num_bytes_to_read); + memcpy(buf, m_data + m_cursor_pos, num_bytes_read); + m_cursor_pos += num_bytes_read; return ErrorCode_Success; } @@ -27,17 +33,11 @@ using std::string_view; 
return ErrorCode_Success; } -ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { +[[nodiscard]] ErrorCode BufferReader::try_get_pos (size_t& pos) { if (nullptr == m_data) { return ErrorCode_NotInit; } - if (nullptr == buf) { - return ErrorCode_BadParam; - } - - num_bytes_read = std::min(m_size - m_cursor_pos, num_bytes_to_read); - memcpy(buf, m_data + m_cursor_pos, num_bytes_read); - m_cursor_pos += num_bytes_read; + pos = m_cursor_pos; return ErrorCode_Success; } diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index ae9b0586e..d73a403fa 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -1,11 +1,6 @@ - #ifndef BufferReader_HPP #define BufferReader_HPP -// C standard libraries - -// C++ standard libraries - // Project headers #include "ReaderInterface.hpp" @@ -23,24 +18,16 @@ class BufferReader : public ReaderInterface { return "BufferReader operation failed"; } }; - BufferReader () : m_data(nullptr), - m_size(0), - m_cursor_pos(0) {} - BufferReader (const int8_t* data, size_t size) : - m_data(data), - m_size(size), - m_cursor_pos(0) {} + // Constructors + BufferReader () : m_data(nullptr), m_size(0), m_cursor_pos(0) {} + BufferReader (const char* data, size_t size) : m_data(data), m_size(size), m_cursor_pos(0) {} + + // Methods [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; - [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; - [[nodiscard]] size_t get_buffer_length() { return m_size; } - void reset_buffer (const int8_t* data, size_t size) { - m_data = data; - m_size = size; - m_cursor_pos = 0; - } + [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; /** * Tries reading a string view of size = read_size from the ir_buf. 
@@ -50,14 +37,15 @@ class BufferReader : public ReaderInterface { * data to decode **/ [[nodiscard]] bool try_read_string_view (std::string_view& str_view, size_t read_size); - -protected: - void reset_buffer (const int8_t* data, size_t size, size_t cursor_pos) { + [[nodiscard]] size_t get_buffer_length() { return m_size; } + void reset_buffer (const char* data, size_t size) { m_data = data; m_size = size; - m_cursor_pos = cursor_pos; + m_cursor_pos = 0; } - const int8_t* m_data; + +private: + const char* m_data; size_t m_size; size_t m_cursor_pos; }; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 6a6378bc0..03e1f0298 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -13,13 +13,12 @@ // Project headers #include -#include "Utils.hpp" using std::make_unique; using std::move; using std::string; -static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, +static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, size_t& num_bytes_read); BufferedFileReader::BufferedFileReader () { @@ -31,7 +30,7 @@ BufferedFileReader::BufferedFileReader () { SPDLOG_ERROR("Failed to init reader buffer size to be {}", cDefaultBufferSize); throw OperationFailed(error_code, __FILENAME__, __LINE__); } - m_buffer = make_unique(m_reader_buffer_size); + m_buffer = make_unique(m_buffer_size); } BufferedFileReader::~BufferedFileReader () { @@ -39,11 +38,11 @@ BufferedFileReader::~BufferedFileReader () { } size_t BufferedFileReader::remaining_data_size () const { - if (m_size == 0) { + if (m_data_size == 0) { return 0; } - assert(m_size >= cursor_pos()); - return m_size - cursor_pos(); + assert(m_data_size >= cursor_pos()); + return m_data_size - cursor_pos(); } ErrorCode BufferedFileReader::try_get_pos (size_t& pos) { @@ -64,27 +63,27 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, 
num_bytes_refilled = 0; if (false == m_checkpoint_enabled) { // recover from a previous reset - if (m_size > refill_size) { - m_buffer = make_unique(refill_size); + if (m_data_size > refill_size) { + m_buffer = make_unique(refill_size); } auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), refill_size, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } - m_buffer_begin_pos = m_file_pos & m_reader_buffer_aligned_mask; - m_size = num_bytes_refilled; + m_buffer_begin_pos = m_file_pos & m_buffer_aligned_mask; + m_data_size = num_bytes_refilled; } else { // Messy way of copying data from old buffer to new buffer - auto new_buffer = make_unique(m_size + refill_size); - memcpy(new_buffer.get(), m_buffer.get(), m_size); - auto error_code = try_read_into_buffer(m_fd, &new_buffer[m_size], refill_size, + auto new_buffer = make_unique(m_data_size + refill_size); + memcpy(new_buffer.get(), m_buffer.get(), m_data_size); + auto error_code = try_read_into_buffer(m_fd, &new_buffer[m_data_size], refill_size, num_bytes_refilled); m_buffer = std::move(new_buffer); if (error_code != ErrorCode_Success) { return error_code; } - m_size += num_bytes_refilled; + m_data_size += num_bytes_refilled; } return ErrorCode_Success; @@ -115,7 +114,7 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, break; } // refill the buffer if more bytes are to be read - auto error_code = refill_reader_buffer(m_reader_buffer_size); + auto error_code = refill_reader_buffer(m_buffer_size); if (ErrorCode_EndOfFile == error_code) { break; } else if (ErrorCode_Success != error_code) { @@ -133,55 +132,56 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (m_fd == -1) { return ErrorCode_NotInit; } - // early return path if (pos == m_file_pos) { return ErrorCode_Success; } if (pos <= m_file_pos) { if (false == m_checkpoint_enabled) { - SPDLOG_ERROR("Seek back not allowed when checkpoint is not enabled"); + SPDLOG_ERROR("Error: Seek 
back when checkpoint is not enabled"); return ErrorCode_Failure; } if (pos < m_checkpoint_pos) { - SPDLOG_ERROR("Seek back before the checkpoint is not supported"); + SPDLOG_ERROR("Error: trying to seek to {} which is ahead of checkpoint: {}", + pos, m_checkpoint_pos); return ErrorCode_Failure; } } else { auto buffer_available_data = remaining_data_size(); auto seek_distance = pos - m_file_pos; if (seek_distance <= buffer_available_data) { - } else if (false == m_checkpoint_enabled) { - m_buffer_begin_pos = pos & m_reader_buffer_aligned_mask; + m_file_pos = pos; + return ErrorCode_Success; + } + // Handle the case where buffer doesn't contain enough data for seek + if (false == m_checkpoint_enabled) { + m_buffer_begin_pos = pos & m_buffer_aligned_mask; auto offset = lseek(m_fd, m_buffer_begin_pos, SEEK_SET); if (offset == -1) { return ErrorCode_errno; } // invalidate buffered_data - m_size = 0; + m_data_size = 0; } else { - size_t num_bytes_to_refill = seek_distance - buffer_available_data; + // Note: we can safely assume that m_size will be a multiple of + // m_reader_buffer_size. if m_size is not a multiple of + // m_reader_buffer_size, if must mean the file has reached EoF + // and the code will throw an error anyway + size_t num_bytes_to_refill = pos - m_buffer_begin_pos + m_data_size; + size_t quantizied_refill_size = + (1 + ((num_bytes_to_refill - 1) >> m_buffer_exp)) << m_buffer_exp; size_t num_bytes_refilled {0}; - while (true) { - auto error_code = refill_reader_buffer(m_reader_buffer_size, num_bytes_refilled); - if (ErrorCode_EndOfFile == error_code) { - SPDLOG_ERROR("not expecting to seek pass the Entire file"); - throw; - } - if (ErrorCode_Success != error_code) { - return error_code; - } - if (num_bytes_to_refill <= m_reader_buffer_size) { - break; - } - // do we need to distinguish num_bytes_refilled vs m_reader_buffer_size - // since we might anyway terminate the loop? 
- num_bytes_to_refill -= num_bytes_refilled; + auto error_code = refill_reader_buffer(quantizied_refill_size, num_bytes_refilled); + if (ErrorCode_EndOfFile == error_code || num_bytes_refilled < num_bytes_to_refill) { + SPDLOG_ERROR("not expecting to seek pass the Entire file"); + throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); + } + if (ErrorCode_Success != error_code) { + return error_code; } } } - // update the m_file_pos at success m_file_pos = pos; return ErrorCode_Success; } @@ -192,8 +192,8 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha return ErrorCode_NotInit; } // Refill the buffer if necessary - if (0 == m_size) { - auto error_code = refill_reader_buffer(m_reader_buffer_size); + if (0 == m_data_size) { + auto error_code = refill_reader_buffer(m_buffer_size); if (ErrorCode_Success != error_code) { data_ptr = nullptr; peek_size = 0; @@ -218,7 +218,7 @@ ErrorCode BufferedFileReader::try_open (const string& path) { } m_path = path; m_file_pos = 0; - m_size = 0; + m_data_size = 0; m_buffer_begin_pos = 0; return ErrorCode_Success; } @@ -237,21 +237,21 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim size_t delim_pos; while (false == found_delim) { // find the pointer pointing to the delimiter - const auto* delim_ptr = reinterpret_cast(memchr(buffer_head(), delim, - remaining_data_size())); + const auto* delim_ptr = reinterpret_cast(memchr(buffer_head(), delim, + remaining_data_size())); if (delim_ptr != nullptr) { delim_pos = (delim_ptr - m_buffer.get()) + 1; found_delim = true; } else { - delim_pos = m_size; + delim_pos = m_data_size; } // append to strings size_t str_length = delim_pos - cursor_pos(); str.append(reinterpret_cast(buffer_head()), str_length); - // increment file pos to the delimiter or the end of file + m_file_pos += str_length; if (false == found_delim) { - if (auto error_code = refill_reader_buffer(m_reader_buffer_size); + if (auto error_code = 
refill_reader_buffer(m_buffer_size); ErrorCode_Success != error_code) { return error_code; } @@ -294,15 +294,15 @@ void BufferedFileReader::reset_checkpoint () { if (false == m_checkpoint_enabled) { return; } - if (m_size != m_reader_buffer_size) { + if (m_data_size != m_buffer_size) { // allocate new buffer for buffered data that hasn't been seek passed - auto copy_pos = cursor_pos() & m_reader_buffer_aligned_mask; - m_size -= copy_pos; + auto copy_pos = cursor_pos() & m_buffer_aligned_mask; + m_data_size -= copy_pos; // Use a quantized size for the new buffer size - auto new_buffer_size = (1 + ((m_size - 1) >> m_reader_buffer_exp)) << m_reader_buffer_exp; + auto new_buffer_size = (1 + ((m_data_size - 1) >> m_buffer_exp)) << m_buffer_exp; - auto new_buffer = make_unique(new_buffer_size); - memcpy(new_buffer.get(), &m_buffer[copy_pos], m_size); + auto new_buffer = make_unique(new_buffer_size); + memcpy(new_buffer.get(), &m_buffer[copy_pos], m_data_size); m_buffer = std::move(new_buffer); m_buffer_begin_pos += copy_pos; } @@ -329,9 +329,9 @@ ErrorCode BufferedFileReader::set_buffer_size (size_t buffer_size) { return ErrorCode_BadParam; } - m_reader_buffer_exp = static_cast(exponent); - m_reader_buffer_size = buffer_size; - m_reader_buffer_aligned_mask = ~(m_reader_buffer_size - 1); + m_buffer_exp = static_cast(exponent); + m_buffer_size = buffer_size; + m_buffer_aligned_mask = ~(m_buffer_size - 1); return ErrorCode_Success; } @@ -344,7 +344,7 @@ void BufferedFileReader::close () { if (m_checkpoint_enabled) { SPDLOG_DEBUG("close file without resetting checkpoint"); - m_buffer = make_unique(m_reader_buffer_size); + m_buffer = make_unique(m_buffer_size); m_checkpoint_enabled = false; } } @@ -362,7 +362,7 @@ ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { return ErrorCode_Success; } -static ErrorCode try_read_into_buffer(int fd, int8_t* buffer, size_t num_bytes_to_read, +static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t 
num_bytes_to_read, size_t& num_bytes_read) { num_bytes_read = 0; // keep reading from the fd until seeing a 0, which means eof diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index bcfb0ac38..216c895e3 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -121,7 +121,7 @@ class BufferedFileReader : public ReaderInterface { private: [[nodiscard]] size_t cursor_pos() const { return m_file_pos - m_buffer_begin_pos; } [[nodiscard]] size_t remaining_data_size() const; - [[nodiscard]] int8_t* buffer_head() const { return m_buffer.get() + cursor_pos(); } + [[nodiscard]] char* buffer_head() const { return m_buffer.get() + cursor_pos(); } [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); @@ -131,14 +131,14 @@ class BufferedFileReader : public ReaderInterface { std::string m_path; // Buffer specific data - std::unique_ptr m_buffer; - size_t m_size; + std::unique_ptr m_buffer; + size_t m_data_size; size_t m_buffer_begin_pos; - // constant flag - size_t m_reader_buffer_exp; - size_t m_reader_buffer_size; - size_t m_reader_buffer_aligned_mask; + // constant + size_t m_buffer_exp; + size_t m_buffer_size; + size_t m_buffer_aligned_mask; // checkpoint specific data bool m_checkpoint_enabled; size_t m_checkpoint_pos; diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 897a5ed03..0736be5ba 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -212,7 +212,8 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { EightByteEncodingMagicNumber + MagicNumberLength}; // Test eight-byte encoding - ir_buffer.reset_buffer(eight_byte_encoding_vec.data(), eight_byte_encoding_vec.size()); + 
ir_buffer.reset_buffer(reinterpret_cast(eight_byte_encoding_vec.data()), + eight_byte_encoding_vec.size()); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); @@ -221,24 +222,28 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { vector four_byte_encoding_vec{FourByteEncodingMagicNumber, FourByteEncodingMagicNumber + MagicNumberLength}; - ir_buffer.reset_buffer(four_byte_encoding_vec.data(), four_byte_encoding_vec.size()); + ir_buffer.reset_buffer(reinterpret_cast(four_byte_encoding_vec.data()), + four_byte_encoding_vec.size()); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); // Test error on empty and incomplete ir_buffer const vector empty_ir_vec; - ir_buffer.reset_buffer(empty_ir_vec.data(), empty_ir_vec.size()); + ir_buffer.reset_buffer(reinterpret_cast(empty_ir_vec.data()), + empty_ir_vec.size()); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); - ir_buffer.reset_buffer(four_byte_encoding_vec.data(), four_byte_encoding_vec.size() - 1); + ir_buffer.reset_buffer(reinterpret_cast(four_byte_encoding_vec.data()), + four_byte_encoding_vec.size() - 1); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); // Test error on invalid encoding const vector invalid_ir_vec{0x02, 0x43, 0x24, 0x34}; - ir_buffer.reset_buffer(invalid_ir_vec.data(), invalid_ir_vec.size()); + ir_buffer.reset_buffer(reinterpret_cast(invalid_ir_vec.data()), + invalid_ir_vec.size()); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Corrupted_IR); @@ -258,7 +263,7 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode const size_t encoded_preamble_end_pos = ir_buf.size(); // Check if encoding type is properly 
read - ir_buffer.reset_buffer(ir_buf.data(), ir_buf.size()); + ir_buffer.reset_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); bool is_four_bytes_encoding; REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); @@ -297,7 +302,8 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode // Test if incomplete IR can be detected ir_buf.resize(encoded_preamble_end_pos - 1); - BufferReader incomplete_preamble_buffer(ir_buf.data(), ir_buf.size()); + BufferReader incomplete_preamble_buffer(reinterpret_cast(ir_buf.data()), + ir_buf.size()); incomplete_preamble_buffer.seek_from_begin(MagicNumberLength); REQUIRE(decode_preamble( incomplete_preamble_buffer, metadata_type, metadata_pos, metadata_size) == @@ -306,7 +312,8 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode // Test if corrupted IR can be detected ir_buf[MagicNumberLength] = 0x23; ir_buffer.seek_from_begin(MagicNumberLength); - BufferReader corrupted_preamble_buffer(ir_buf.data(), ir_buf.size()); + BufferReader corrupted_preamble_buffer(reinterpret_cast(ir_buf.data()), + ir_buf.size()); REQUIRE(decode_preamble( corrupted_preamble_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Corrupted_IR); @@ -316,6 +323,7 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", four_byte_encoded_variable_t, eight_byte_encoded_variable_t) { vector ir_buf; + BufferReader ir_buffer; string logtype; string placeholder_as_string{enum_to_underlying_type(VariablePlaceholder::Dictionary)}; @@ -326,24 +334,27 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", const size_t encoded_message_end_pos = ir_buf.size(); const size_t encoded_message_start_pos = 0; - BufferReader encoded_message_buffer(ir_buf.data(), ir_buf.size()); + // Test if message can be decoded properly + ir_buffer.reset_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); 
string decoded_message; epoch_time_ms_t timestamp; REQUIRE(IRErrorCode::IRErrorCode_Success == - decode_next_message(encoded_message_buffer, decoded_message, timestamp)); + decode_next_message(ir_buffer, decoded_message, timestamp)); REQUIRE(message == decoded_message); REQUIRE(timestamp == reference_timestamp); - REQUIRE(encoded_message_buffer.get_pos() == encoded_message_end_pos); + REQUIRE(ir_buffer.get_pos() == encoded_message_end_pos); - encoded_message_buffer.seek_from_begin(encoded_message_start_pos + 1); + // Test corrupted IR + ir_buffer.seek_from_begin(encoded_message_start_pos + 1); REQUIRE(IRErrorCode::IRErrorCode_Corrupted_IR == - decode_next_message(encoded_message_buffer, message, timestamp)); + decode_next_message(ir_buffer, message, timestamp)); + // Test incomplete IR ir_buf.resize(encoded_message_end_pos - 4); - BufferReader incomplete_message_buffer(ir_buf.data(), ir_buf.size()); + ir_buffer.reset_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); REQUIRE(IRErrorCode::IRErrorCode_Incomplete_IR == - decode_next_message(incomplete_message_buffer, message, timestamp)); + decode_next_message(ir_buffer, message, timestamp)); } // NOTE: This test only tests eight_byte_encoded_variable_t because we trigger @@ -374,7 +385,8 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") // Test if a trailing escape triggers a decoder error auto ir_with_extra_escape {ir_buf}; ir_with_extra_escape.at(logtype_end_pos - 1) = ffi::cVariablePlaceholderEscapeCharacter; - ir_buffer.reset_buffer(ir_with_extra_escape.data(), ir_with_extra_escape.size()); + ir_buffer.reset_buffer(reinterpret_cast(ir_with_extra_escape.data()), + ir_with_extra_escape.size()); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == decode_next_message(ir_buffer, decoded_message, timestamp)); @@ -383,7 +395,8 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") auto ir_with_extra_placeholder{ir_buf}; ir_with_extra_placeholder.at(logtype_end_pos - 1) = 
enum_to_underlying_type(VariablePlaceholder::Dictionary); - ir_buffer.reset_buffer(ir_with_extra_escape.data(), ir_with_extra_escape.size()); + ir_buffer.reset_buffer(reinterpret_cast(ir_with_extra_placeholder.data()), + ir_with_extra_placeholder.size()); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == decode_next_message(ir_buffer, decoded_message, timestamp)); @@ -399,7 +412,7 @@ TEST_CASE("decode_next_message_four_byte_negative_delta", "[ffi][decode_next_mes REQUIRE(true == encode_message(reference_delta_ts_negative, message, logtype, ir_buf)); - BufferReader ir_buffer(ir_buf.data(), ir_buf.size()); + BufferReader ir_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); string decoded_message; epoch_time_ms_t delta_ts; REQUIRE(IRErrorCode::IRErrorCode_Success == @@ -442,7 +455,8 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", reference_messages.push_back(message); reference_timestamps.push_back(ts); - BufferReader complete_encoding_buffer(ir_buf.data(), ir_buf.size()); + BufferReader complete_encoding_buffer(reinterpret_cast(ir_buf.data()), + ir_buf.size()); bool is_four_bytes_encoding; REQUIRE(get_encoding_type(complete_encoding_buffer, is_four_bytes_encoding) == From 1afc47cefb6cec263879a6188d3baeac4c7fd0c9 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 18 May 2023 22:22:57 -0400 Subject: [PATCH 033/121] further clean up --- components/core/src/BufferedFileReader.cpp | 271 +++++++++--------- components/core/src/BufferedFileReader.hpp | 5 +- .../core/tests/test-BufferedFileReader.cpp | 9 + 3 files changed, 148 insertions(+), 137 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 03e1f0298..a6a6a6ede 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -37,14 +37,6 @@ BufferedFileReader::~BufferedFileReader () { close(); } -size_t 
BufferedFileReader::remaining_data_size () const { - if (m_data_size == 0) { - return 0; - } - assert(m_data_size >= cursor_pos()); - return m_data_size - cursor_pos(); -} - ErrorCode BufferedFileReader::try_get_pos (size_t& pos) { if (-1 == m_fd) { return ErrorCode_NotInit; @@ -53,81 +45,6 @@ ErrorCode BufferedFileReader::try_get_pos (size_t& pos) { return ErrorCode_Success; } -ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size) { - size_t num_bytes_refilled; - return refill_reader_buffer (refill_size, num_bytes_refilled); -} - -ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, - size_t& num_bytes_refilled) { - num_bytes_refilled = 0; - if (false == m_checkpoint_enabled) { - // recover from a previous reset - if (m_data_size > refill_size) { - m_buffer = make_unique(refill_size); - } - auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), - refill_size, num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; - } - m_buffer_begin_pos = m_file_pos & m_buffer_aligned_mask; - m_data_size = num_bytes_refilled; - } else { - // Messy way of copying data from old buffer to new buffer - auto new_buffer = make_unique(m_data_size + refill_size); - memcpy(new_buffer.get(), m_buffer.get(), m_data_size); - auto error_code = try_read_into_buffer(m_fd, &new_buffer[m_data_size], refill_size, - num_bytes_refilled); - m_buffer = std::move(new_buffer); - if (error_code != ErrorCode_Success) { - return error_code; - } - m_data_size += num_bytes_refilled; - - } - return ErrorCode_Success; -} - -ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, - size_t& num_bytes_read) { - if (-1 == m_fd) { - return ErrorCode_NotInit; - } - if (nullptr == buf) { - return ErrorCode_BadParam; - } - - num_bytes_read = 0; - size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; - // keep reading until enough data is read or an eof is seen - while (true) { - auto avaiable_bytes_for_read = 
std::min(num_bytes_to_read_from_buffer, - remaining_data_size()); - memcpy(buf + num_bytes_read, buffer_head(), avaiable_bytes_for_read); - - num_bytes_to_read_from_buffer -= avaiable_bytes_for_read; - num_bytes_read += avaiable_bytes_for_read; - - m_file_pos += avaiable_bytes_for_read; - if (num_bytes_to_read_from_buffer == 0) { - break; - } - // refill the buffer if more bytes are to be read - auto error_code = refill_reader_buffer(m_buffer_size); - if (ErrorCode_EndOfFile == error_code) { - break; - } else if (ErrorCode_Success != error_code) { - return error_code; - } - } - - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } - return ErrorCode_Success; -} - ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (m_fd == -1) { return ErrorCode_NotInit; @@ -163,15 +80,11 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { // invalidate buffered_data m_data_size = 0; } else { - // Note: we can safely assume that m_size will be a multiple of - // m_reader_buffer_size. 
if m_size is not a multiple of - // m_reader_buffer_size, if must mean the file has reached EoF - // and the code will throw an error anyway - size_t num_bytes_to_refill = pos - m_buffer_begin_pos + m_data_size; - size_t quantizied_refill_size = - (1 + ((num_bytes_to_refill - 1) >> m_buffer_exp)) << m_buffer_exp; + size_t num_bytes_to_refill = pos - (m_buffer_begin_pos + m_data_size); + size_t quantized_refill_size = quantize_to_buffer_size(num_bytes_to_refill); + size_t num_bytes_refilled {0}; - auto error_code = refill_reader_buffer(quantizied_refill_size, num_bytes_refilled); + auto error_code = refill_reader_buffer(quantized_refill_size, num_bytes_refilled); if (ErrorCode_EndOfFile == error_code || num_bytes_refilled < num_bytes_to_refill) { SPDLOG_ERROR("not expecting to seek pass the Entire file"); throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); @@ -186,45 +99,47 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } -ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const char*& data_ptr, - size_t& peek_size) { +ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, + size_t& num_bytes_read) { if (-1 == m_fd) { return ErrorCode_NotInit; } - // Refill the buffer if necessary - if (0 == m_data_size) { + if (nullptr == buf) { + return ErrorCode_BadParam; + } + + num_bytes_read = 0; + size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; + // keep reading until enough data is read or an eof is seen + while (true) { + auto avaiable_bytes_for_read = std::min(num_bytes_to_read_from_buffer, + remaining_data_size()); + memcpy(buf + num_bytes_read, buffer_head(), avaiable_bytes_for_read); + + num_bytes_to_read_from_buffer -= avaiable_bytes_for_read; + num_bytes_read += avaiable_bytes_for_read; + + m_file_pos += avaiable_bytes_for_read; + if (num_bytes_to_read_from_buffer == 0) { + break; + } + // refill the buffer if more bytes are to be read auto error_code = 
refill_reader_buffer(m_buffer_size); - if (ErrorCode_Success != error_code) { - data_ptr = nullptr; - peek_size = 0; + if (ErrorCode_EndOfFile == error_code) { + break; + } else if (ErrorCode_Success != error_code) { return error_code; } } - peek_size = std::min(size_to_peek, remaining_data_size()); - data_ptr = reinterpret_cast(buffer_head()); - return ErrorCode_Success; -} -ErrorCode BufferedFileReader::try_open (const string& path) { - // Cleanup in case caller forgot to call close before calling this function - close(); - - m_fd = ::open(path.c_str(), O_RDONLY); - if (-1 == m_fd) { - if (ENOENT == errno) { - return ErrorCode_FileNotFound; - } - return ErrorCode_errno; + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; } - m_path = path; - m_file_pos = 0; - m_data_size = 0; - m_buffer_begin_pos = 0; return ErrorCode_Success; } ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delimiter, - bool append, string& str) { + bool append, string& str) { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -260,6 +175,24 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim return ErrorCode_Success; } +ErrorCode BufferedFileReader::try_open (const string& path) { + // Cleanup in case caller forgot to call close before calling this function + close(); + + m_fd = ::open(path.c_str(), O_RDONLY); + if (-1 == m_fd) { + if (ENOENT == errno) { + return ErrorCode_FileNotFound; + } + return ErrorCode_errno; + } + m_path = path; + m_file_pos = 0; + m_data_size = 0; + m_buffer_begin_pos = 0; + return ErrorCode_Success; +} + void BufferedFileReader::open (const string& path) { ErrorCode error_code = try_open(path); if (ErrorCode_Success != error_code) { @@ -271,6 +204,33 @@ void BufferedFileReader::open (const string& path) { } } +void BufferedFileReader::close () { + if (-1 != m_fd) { + // NOTE: We don't check errors for fclose since it seems + // the only reason it could fail is if it was interrupted by a signal + 
::close(m_fd); + m_fd = -1; + + if (m_checkpoint_enabled) { + SPDLOG_DEBUG("close file without resetting checkpoint"); + m_buffer = make_unique(m_buffer_size); + m_checkpoint_enabled = false; + } + } +} + +ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { + if (-1 == m_fd) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + auto return_value = fstat(m_fd, &stat_buffer); + if (0 != return_value) { + return ErrorCode_errno; + } + return ErrorCode_Success; +} + void BufferedFileReader::revert_pos() { if (false == m_checkpoint_enabled) { SPDLOG_ERROR("Checkpoint is not enabled"); @@ -299,7 +259,7 @@ void BufferedFileReader::reset_checkpoint () { auto copy_pos = cursor_pos() & m_buffer_aligned_mask; m_data_size -= copy_pos; // Use a quantized size for the new buffer size - auto new_buffer_size = (1 + ((m_data_size - 1) >> m_buffer_exp)) << m_buffer_exp; + auto new_buffer_size = quantize_to_buffer_size(m_data_size); auto new_buffer = make_unique(new_buffer_size); memcpy(new_buffer.get(), &m_buffer[copy_pos], m_data_size); @@ -335,33 +295,74 @@ ErrorCode BufferedFileReader::set_buffer_size (size_t buffer_size) { return ErrorCode_Success; } -void BufferedFileReader::close () { - if (-1 != m_fd) { - // NOTE: We don't check errors for fclose since it seems - // the only reason it could fail is if it was interrupted by a signal - ::close(m_fd); - m_fd = -1; - - if (m_checkpoint_enabled) { - SPDLOG_DEBUG("close file without resetting checkpoint"); - m_buffer = make_unique(m_buffer_size); - m_checkpoint_enabled = false; +ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const char*& data_ptr, + size_t& peek_size) { + if (-1 == m_fd) { + return ErrorCode_NotInit; + } + // Refill the buffer if necessary + if (0 == m_data_size) { + auto error_code = refill_reader_buffer(m_buffer_size); + if (ErrorCode_Success != error_code) { + data_ptr = nullptr; + peek_size = 0; + return error_code; } } + peek_size = 
std::min(size_to_peek, remaining_data_size()); + data_ptr = reinterpret_cast(buffer_head()); + return ErrorCode_Success; } -ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { - if (-1 == m_fd) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); +size_t BufferedFileReader::remaining_data_size () const { + if (m_data_size == 0) { + return 0; } + assert(m_data_size >= cursor_pos()); + return m_data_size - cursor_pos(); +} + +size_t BufferedFileReader::quantize_to_buffer_size (size_t size) { + return (1 + ((size - 1) >> m_buffer_exp)) << m_buffer_exp; +} + +ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size) { + size_t num_bytes_refilled; + return refill_reader_buffer (refill_size, num_bytes_refilled); +} + +ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, + size_t& num_bytes_refilled) { + num_bytes_refilled = 0; + if (false == m_checkpoint_enabled) { + // recover from a previous reset if necessary + if (m_data_size > refill_size) { + m_buffer = make_unique(refill_size); + } + auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), + refill_size, num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; + } + m_buffer_begin_pos = m_file_pos & m_buffer_aligned_mask; + m_data_size = num_bytes_refilled; + } else { + // Messy way of copying data from old buffer to new buffer + auto new_buffer = make_unique(m_data_size + refill_size); + memcpy(new_buffer.get(), m_buffer.get(), m_data_size); + auto error_code = try_read_into_buffer(m_fd, &new_buffer[m_data_size], refill_size, + num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; + } + m_buffer = std::move(new_buffer); + m_data_size += num_bytes_refilled; - auto return_value = fstat(m_fd, &stat_buffer); - if (0 != return_value) { - return ErrorCode_errno; } return ErrorCode_Success; } + static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, size_t& 
num_bytes_read) { num_bytes_read = 0; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 216c895e3..df259db5f 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -120,11 +120,12 @@ class BufferedFileReader : public ReaderInterface { private: [[nodiscard]] size_t cursor_pos() const { return m_file_pos - m_buffer_begin_pos; } - [[nodiscard]] size_t remaining_data_size() const; [[nodiscard]] char* buffer_head() const { return m_buffer.get() + cursor_pos(); } + + [[nodiscard]] size_t remaining_data_size() const; + [[nodiscard]] size_t quantize_to_buffer_size(size_t size); [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); - // Types size_t m_file_pos; int m_fd; diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 670ade836..e4668530f 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -179,6 +179,15 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { num_byte_read)); REQUIRE(num_bytes_to_read == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); + + // test a seek that reaches the end of the file + num_bytes_to_read = 500; + seek_pos = test_data_size - num_bytes_to_read; + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, + num_byte_read)); + REQUIRE(num_bytes_to_read == num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); } delete[] test_data; From 7868252c0b6836043c326efbf75b660d5bd88e8b Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 18 May 2023 22:42:24 -0400 Subject: [PATCH 
034/121] remove function that is unnecessary --- components/core/src/BufferReader.hpp | 1 - components/core/src/ffi/ir_stream/decoding_methods.cpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index d73a403fa..7f9ccf2c4 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -37,7 +37,6 @@ class BufferReader : public ReaderInterface { * data to decode **/ [[nodiscard]] bool try_read_string_view (std::string_view& str_view, size_t read_size); - [[nodiscard]] size_t get_buffer_length() { return m_size; } void reset_buffer (const char* data, size_t size) { m_data = data; m_size = size; diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 0e208d94c..c674f99e2 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -463,9 +463,6 @@ namespace ffi::ir_stream { } IRErrorCode get_encoding_type (BufferReader& ir_buf, bool& is_four_bytes_encoding) { - if (ir_buf.get_buffer_length() == 0) { - return IRErrorCode_Incomplete_IR; - } char buffer[cProtocol::MagicNumberLength]; size_t num_bytes_read; auto error_code = ir_buf.try_read(buffer, cProtocol::MagicNumberLength, num_bytes_read); From 10c1bad5b944db42a67236f4bfa778aa9176f954 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sat, 20 May 2023 20:18:17 -0400 Subject: [PATCH 035/121] checkpoint for profiling stuff --- components/core/CMakeLists.txt | 211 ++++++++++++++++++ components/core/src/BufferReader.cpp | 13 +- components/core/src/BufferReader.hpp | 10 +- components/core/src/BufferedFileReader.cpp | 23 ++ components/core/src/BufferedFileReader.hpp | 17 +- .../core/src/BufferedReaderInterface.cpp | 8 + .../core/src/BufferedReaderInterface.hpp | 28 +++ .../src/ffi/ir_stream/decoding_methods.cpp | 84 +++---- 
.../src/ffi/ir_stream/decoding_methods.hpp | 10 +- .../core/tests/test-ir_encoding_methods.cpp | 12 +- 10 files changed, 358 insertions(+), 58 deletions(-) create mode 100644 components/core/src/BufferedReaderInterface.cpp create mode 100644 components/core/src/BufferedReaderInterface.hpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 262ce172d..8e0fb79d0 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -856,3 +856,214 @@ target_compile_features(unitTest ) include(cmake/utils.cmake) + +set(SOURCE_FILES_testbench + src/BufferReader.cpp + src/BufferReader.hpp + src/BufferedFileReader.cpp + src/BufferedFileReader.hpp + src/clp/CommandLineArguments.cpp + src/clp/CommandLineArguments.hpp + src/clp/compression.cpp + src/clp/compression.hpp + src/clp/decompression.cpp + src/clp/decompression.hpp + src/clp/FileCompressor.cpp + src/clp/FileCompressor.hpp + src/clp/FileDecompressor.cpp + src/clp/FileDecompressor.hpp + src/clp/FileToCompress.cpp + src/clp/FileToCompress.hpp + src/clp/run.cpp + src/clp/run.hpp + src/clp/StructuredFileToCompress.cpp + src/clp/StructuredFileToCompress.hpp + src/clp/utils.cpp + src/clp/utils.hpp + src/compressor_frontend/Constants.hpp + src/compressor_frontend/finite_automata/RegexAST.hpp + src/compressor_frontend/finite_automata/RegexAST.tpp + src/compressor_frontend/finite_automata/RegexDFA.hpp + src/compressor_frontend/finite_automata/RegexDFA.tpp + src/compressor_frontend/finite_automata/RegexNFA.hpp + src/compressor_frontend/finite_automata/RegexNFA.tpp + src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp + src/compressor_frontend/finite_automata/UnicodeIntervalTree.tpp + src/compressor_frontend/LALR1Parser.cpp + src/compressor_frontend/LALR1Parser.hpp + src/compressor_frontend/LALR1Parser.tpp + src/compressor_frontend/Lexer.hpp + src/compressor_frontend/Lexer.tpp + src/compressor_frontend/LogParser.cpp + src/compressor_frontend/LogParser.hpp + 
src/compressor_frontend/SchemaParser.cpp + src/compressor_frontend/SchemaParser.hpp + src/compressor_frontend/Token.cpp + src/compressor_frontend/Token.hpp + src/compressor_frontend/utils.cpp + src/compressor_frontend/utils.hpp + src/database_utils.cpp + src/database_utils.hpp + src/Defs.h + src/dictionary_utils.cpp + src/dictionary_utils.hpp + src/DictionaryEntry.cpp + src/DictionaryEntry.hpp + src/DictionaryReader.cpp + src/DictionaryReader.hpp + src/DictionaryWriter.cpp + src/DictionaryWriter.hpp + src/EncodedVariableInterpreter.cpp + src/EncodedVariableInterpreter.hpp + src/ErrorCode.hpp + src/ffi/encoding_methods.cpp + src/ffi/encoding_methods.hpp + src/ffi/encoding_methods.tpp + src/ffi/ir_stream/byteswap.hpp + src/ffi/ir_stream/decoding_methods.cpp + src/ffi/ir_stream/decoding_methods.hpp + src/ffi/ir_stream/encoding_methods.cpp + src/ffi/ir_stream/encoding_methods.hpp + src/ffi/ir_stream/protocol_constants.hpp + src/ffi/search/CompositeWildcardToken.cpp + src/ffi/search/CompositeWildcardToken.hpp + src/ffi/search/ExactVariableToken.cpp + src/ffi/search/ExactVariableToken.hpp + src/ffi/search/query_methods.cpp + src/ffi/search/query_methods.hpp + src/ffi/search/QueryMethodFailed.hpp + src/ffi/search/QueryToken.hpp + src/ffi/search/QueryWildcard.cpp + src/ffi/search/QueryWildcard.hpp + src/ffi/search/WildcardToken.cpp + src/ffi/search/WildcardToken.hpp + src/FileReader.cpp + src/FileReader.hpp + src/FileWriter.cpp + src/FileWriter.hpp + src/GlobalMetadataDB.cpp + src/GlobalMetadataDB.hpp + src/GlobalMetadataDBConfig.cpp + src/GlobalMetadataDBConfig.hpp + src/GlobalMySQLMetadataDB.cpp + src/GlobalMySQLMetadataDB.hpp + src/GlobalSQLiteMetadataDB.cpp + src/GlobalSQLiteMetadataDB.hpp + src/Grep.cpp + src/Grep.hpp + src/LibarchiveFileReader.cpp + src/LibarchiveFileReader.hpp + src/LibarchiveReader.cpp + src/LibarchiveReader.hpp + src/LogTypeDictionaryEntry.cpp + src/LogTypeDictionaryEntry.hpp + src/LogTypeDictionaryReader.cpp + src/LogTypeDictionaryReader.hpp + 
src/LogTypeDictionaryWriter.cpp + src/LogTypeDictionaryWriter.hpp + src/MessageParser.cpp + src/MessageParser.hpp + src/MySQLDB.cpp + src/MySQLDB.hpp + src/MySQLParamBindings.cpp + src/MySQLParamBindings.hpp + src/MySQLPreparedStatement.cpp + src/MySQLPreparedStatement.hpp + src/PageAllocatedVector.cpp + src/PageAllocatedVector.hpp + src/ParsedMessage.cpp + src/ParsedMessage.hpp + src/Platform.hpp + src/Profiler.cpp + src/Profiler.hpp + src/Query.cpp + src/Query.hpp + src/ReaderInterface.cpp + src/ReaderInterface.hpp + src/SQLiteDB.cpp + src/SQLiteDB.hpp + src/SQLitePreparedStatement.cpp + src/SQLitePreparedStatement.hpp + src/Stopwatch.cpp + src/Stopwatch.hpp + src/streaming_archive/Constants.hpp + src/streaming_archive/MetadataDB.cpp + src/streaming_archive/MetadataDB.hpp + src/streaming_archive/reader/Archive.cpp + src/streaming_archive/reader/Archive.hpp + src/streaming_archive/reader/File.cpp + src/streaming_archive/reader/File.hpp + src/streaming_archive/reader/Message.cpp + src/streaming_archive/reader/Message.hpp + src/streaming_archive/reader/Segment.cpp + src/streaming_archive/reader/Segment.hpp + src/streaming_archive/reader/SegmentManager.cpp + src/streaming_archive/reader/SegmentManager.hpp + src/streaming_archive/writer/Archive.cpp + src/streaming_archive/writer/Archive.hpp + src/streaming_archive/writer/File.cpp + src/streaming_archive/writer/File.hpp + src/streaming_archive/writer/Segment.cpp + src/streaming_archive/writer/Segment.hpp + src/streaming_compression/Compressor.cpp + src/streaming_compression/Compressor.hpp + src/streaming_compression/Constants.hpp + src/streaming_compression/Decompressor.cpp + src/streaming_compression/Decompressor.hpp + src/streaming_compression/passthrough/Compressor.cpp + src/streaming_compression/passthrough/Compressor.hpp + src/streaming_compression/passthrough/Decompressor.cpp + src/streaming_compression/passthrough/Decompressor.hpp + src/streaming_compression/zstd/Compressor.cpp + 
src/streaming_compression/zstd/Compressor.hpp + src/streaming_compression/zstd/Constants.hpp + src/streaming_compression/zstd/Decompressor.cpp + src/streaming_compression/zstd/Decompressor.hpp + src/string_utils.cpp + src/string_utils.hpp + src/string_utils.tpp + src/StringReader.cpp + src/StringReader.hpp + src/TimestampPattern.cpp + src/TimestampPattern.hpp + src/TraceableException.cpp + src/TraceableException.hpp + src/type_utils.hpp + src/Utils.cpp + src/Utils.hpp + src/VariableDictionaryEntry.cpp + src/VariableDictionaryEntry.hpp + src/VariableDictionaryReader.cpp + src/VariableDictionaryReader.hpp + src/VariableDictionaryWriter.cpp + src/VariableDictionaryWriter.hpp + src/version.hpp + src/WriterInterface.cpp + src/WriterInterface.hpp + submodules/Catch2/single_include/catch2/catch.hpp + submodules/date/include/date/date.h + submodules/json/single_include/nlohmann/json.hpp + submodules/sqlite3/sqlite3.c + submodules/sqlite3/sqlite3.h + submodules/sqlite3/sqlite3ext.h + src/temp_testbench/main.cpp + src/BufferedReaderInterface.cpp src/BufferedReaderInterface.hpp) + +add_executable(testbench + ${SOURCE_FILES_testbench} + ) +target_link_libraries(testbench + PRIVATE + Boost::filesystem Boost::iostreams Boost::program_options + fmt::fmt + LibArchive::LibArchive + MariaDBClient::MariaDBClient + spdlog::spdlog + ${sqlite_LIBRARY_DEPENDENCIES} + ${STD_FS_LIBS} + yaml-cpp::yaml-cpp + ZStd::ZStd + ) +target_compile_features(testbench + PRIVATE cxx_std_17 + ) \ No newline at end of file diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index baba3dcf8..2232aeea8 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -41,12 +41,19 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n return ErrorCode_Success; } -bool BufferReader::try_read_string_view (string_view& str_view, size_t read_size) { +bool BufferReader::try_read_string_view (MyStringView& str_view, 
size_t read_size) { + if (nullptr == m_data) { + return ErrorCode_NotInit; + } if ((m_cursor_pos + read_size) > m_size) { return false; } - str_view = string_view(reinterpret_cast(m_data + m_cursor_pos), - read_size); + str_view.m_buffer_pos = m_cursor_pos; + str_view.m_size = read_size; m_cursor_pos += read_size; return true; +} + +const char* BufferReader::get_buffer_ptr () { + return m_data; } \ No newline at end of file diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 7f9ccf2c4..199446f96 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -2,9 +2,9 @@ #define BufferReader_HPP // Project headers -#include "ReaderInterface.hpp" +#include "BufferedReaderInterface.hpp" -class BufferReader : public ReaderInterface { +class BufferReader : public BufferedReaderInterface { public: // Types class OperationFailed : public TraceableException { @@ -36,7 +36,11 @@ class BufferReader : public ReaderInterface { * @return true on success, false if the ir_buf doesn't contain enough * data to decode **/ - [[nodiscard]] bool try_read_string_view (std::string_view& str_view, size_t read_size); + [[nodiscard]] virtual bool try_read_string_view (MyStringView& str_view, + size_t read_size) override; + + [[nodiscard]] virtual const char* get_buffer_ptr () override; + void reset_buffer (const char* data, size_t size) { m_data = data; m_size = size; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index a6a6a6ede..59faebfcc 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -362,6 +362,29 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, return ErrorCode_Success; } +bool BufferedFileReader::try_read_string_view (MyStringView& str_view, size_t read_size) { + if (-1 == m_fd) { + return false; + } + if (false == m_checkpoint_enabled) { + SPDLOG_ERROR("Can't read string 
view when checkpoint is not enabled"); + throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); + } + str_view.m_buffer_pos = cursor_pos(); + // try to seek to the string view pos. as to pretend that the string has + // been read passed + if (auto error_code = try_seek_from_begin(m_file_pos + read_size); + error_code != ErrorCode_Success) { + SPDLOG_ERROR("Unexpected error happened"); + throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); + } + str_view.m_size = read_size; + return true; +} + +const char* BufferedFileReader::get_buffer_ptr () { + return m_buffer.get(); +} static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, size_t& num_bytes_read) { diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index df259db5f..1df8643a7 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -11,11 +11,11 @@ // Project headers #include "Defs.h" #include "ErrorCode.hpp" -#include "ReaderInterface.hpp" +#include "BufferedReaderInterface.hpp" #include "TraceableException.hpp" -class BufferedFileReader : public ReaderInterface { +class BufferedFileReader : public BufferedReaderInterface { public: static constexpr size_t cDefaultBufferSize = 65536; // Types @@ -114,6 +114,19 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); [[nodiscard]] ErrorCode peek_buffered_data(size_t size_to_peek, const char*& data_ptr, size_t& peek_size); + + /** + * Tries reading a string view of size = read_size from the ir_buf. 
+ * @param str_view Returns the string view + * @param read_size + * @return true on success, false if the ir_buf doesn't contain enough + * data to decode + **/ + [[nodiscard]] virtual bool try_read_string_view (MyStringView& str_view, + size_t read_size) override; + + [[nodiscard]] virtual const char* get_buffer_ptr () override; + size_t mark_pos(); void revert_pos(); void reset_checkpoint (); diff --git a/components/core/src/BufferedReaderInterface.cpp b/components/core/src/BufferedReaderInterface.cpp new file mode 100644 index 000000000..b94be0da0 --- /dev/null +++ b/components/core/src/BufferedReaderInterface.cpp @@ -0,0 +1,8 @@ + +#include "BufferedReaderInterface.hpp" + +// C standard libraries + +// C++ standard libraries + +// Project headers \ No newline at end of file diff --git a/components/core/src/BufferedReaderInterface.hpp b/components/core/src/BufferedReaderInterface.hpp new file mode 100644 index 000000000..b0f38f816 --- /dev/null +++ b/components/core/src/BufferedReaderInterface.hpp @@ -0,0 +1,28 @@ +#ifndef newReaderInterface_HPP +#define newReaderInterface_HPP + +// C standard libraries + +// C++ standard libraries + +// Project headers +#include "ReaderInterface.hpp" + +class BufferedReaderInterface : public ReaderInterface { +public: + // Types + class MyStringView { + public: + MyStringView(): m_buffer_pos(0), m_size(0) {} + MyStringView(size_t pos, size_t size) : + m_buffer_pos(pos), m_size(size) {} + // variable + size_t m_buffer_pos; + size_t m_size; + }; + [[nodiscard]] virtual bool try_read_string_view (MyStringView& str_view, size_t read_size) = 0; + [[nodiscard]] virtual const char* get_buffer_ptr () = 0; +}; + + +#endif // newReaderInterface_HPP diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index c674f99e2..195d0ab0e 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -8,6 +8,7 
@@ using std::is_same_v; using std::string; using std::string_view; using std::vector; +using MyStringView = BufferedReaderInterface::MyStringView; namespace ffi::ir_stream { /** @@ -29,7 +30,7 @@ namespace ffi::ir_stream { * to decode */ template - static bool decode_int (BufferReader& ir_buf, integer_t& value); + static bool decode_int (BufferedReaderInterface& ir_buf, integer_t& value); /** * Decodes the next logtype string from ir_buf @@ -41,8 +42,8 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data * to decode */ - static IRErrorCode parse_logtype (BufferReader& ir_buf, encoded_tag_t encoded_tag, - string_view& logtype); + static IRErrorCode parse_logtype (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, + MyStringView& logtype); /** * Decodes the next dictionary-type variable string from ir_buf @@ -54,8 +55,8 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough * data to decode */ - static IRErrorCode parse_dictionary_var (BufferReader& ir_buf, encoded_tag_t encoded_tag, - string_view& dict_var); + static IRErrorCode parse_dictionary_var (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, + MyStringView& dict_var); /** * Parses the next timestamp from ir_buf @@ -71,7 +72,7 @@ namespace ffi::ir_stream { * to decode */ template - IRErrorCode parse_timestamp (BufferReader& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); + IRErrorCode parse_timestamp (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); /** * Decodes the next encoded message from ir_buf @@ -89,7 +90,7 @@ namespace ffi::ir_stream { * to decode */ template - static IRErrorCode generic_decode_next_message (BufferReader& ir_buf, string& message, + static IRErrorCode generic_decode_next_message (BufferedReaderInterface& ir_buf, string& message, epoch_time_ms_t& timestamp); /** @@ -103,7 +104,7 @@ namespace ffi::ir_stream { * @return 
IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data * to decode */ - static IRErrorCode read_metadata_info (BufferReader& ir_buf, encoded_tag_t& metadata_type, + static IRErrorCode read_metadata_info (BufferedReaderInterface& ir_buf, encoded_tag_t& metadata_type, uint16_t& metadata_size); /** @@ -120,9 +121,10 @@ namespace ffi::ir_stream { */ template static string decode_message ( - string_view logtype, + MyStringView& logtype, + const char* buffer_begin_ptr, const vector& encoded_vars, - const vector& dictionary_vars + const vector& dictionary_vars ); template @@ -152,7 +154,7 @@ namespace ffi::ir_stream { } template - static bool decode_int (BufferReader& ir_buf, integer_t& value) { + static bool decode_int (BufferedReaderInterface& ir_buf, integer_t& value) { integer_t value_small_endian; if (ir_buf.try_read_numeric_value(value_small_endian) != ErrorCode_Success) { return false; @@ -172,8 +174,8 @@ namespace ffi::ir_stream { return true; } - static IRErrorCode parse_logtype (BufferReader& ir_buf, encoded_tag_t encoded_tag, - string_view& logtype) + static IRErrorCode parse_logtype (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, + MyStringView& logtype) { size_t logtype_length; if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { @@ -204,8 +206,8 @@ namespace ffi::ir_stream { return IRErrorCode_Success; } - static IRErrorCode parse_dictionary_var (BufferReader& ir_buf, encoded_tag_t encoded_tag, - string_view& dict_var) { + static IRErrorCode parse_dictionary_var (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, + MyStringView& dict_var) { // Decode variable's length size_t var_length; if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { @@ -239,7 +241,7 @@ namespace ffi::ir_stream { } template - IRErrorCode parse_timestamp (BufferReader& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) + IRErrorCode parse_timestamp (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { 
static_assert(is_same_v || is_same_v); @@ -278,7 +280,7 @@ namespace ffi::ir_stream { } template - static IRErrorCode generic_decode_next_message (BufferReader& ir_buf, string& message, + static IRErrorCode generic_decode_next_message (BufferedReaderInterface& ir_buf, string& message, epoch_time_ms_t& timestamp) { encoded_tag_t encoded_tag; @@ -291,9 +293,9 @@ namespace ffi::ir_stream { // Handle variables vector encoded_vars; - vector dict_vars; + vector dict_vars; encoded_variable_t encoded_variable; - string_view var_str; + MyStringView var_str; bool is_encoded_var; while (is_variable_tag(encoded_tag, is_encoded_var)) { if (is_encoded_var) { @@ -315,7 +317,7 @@ namespace ffi::ir_stream { } // Handle logtype - string_view logtype; + MyStringView logtype; if (auto error_code = parse_logtype(ir_buf, encoded_tag, logtype); IRErrorCode_Success != error_code) { @@ -334,14 +336,15 @@ namespace ffi::ir_stream { } try { - message = decode_message(logtype, encoded_vars, dict_vars); + auto buffer_begin_ptr = ir_buf.get_buffer_ptr(); + message = decode_message(logtype, buffer_begin_ptr, encoded_vars, dict_vars); } catch (const EncodingException& e) { return IRErrorCode_Decode_Error; } return IRErrorCode_Success; } - static IRErrorCode read_metadata_info (BufferReader& ir_buf, encoded_tag_t& metadata_type, + static IRErrorCode read_metadata_info (BufferedReaderInterface& ir_buf, encoded_tag_t& metadata_type, uint16_t& metadata_size) { if (ErrorCode_Success != ir_buf.try_read_numeric_value(metadata_type)) { return IRErrorCode_Incomplete_IR; @@ -375,9 +378,10 @@ namespace ffi::ir_stream { template static string decode_message ( - string_view logtype, + MyStringView& logtype, + const char* buffer_begin_ptr, const vector& encoded_vars, - const vector& dictionary_vars + const vector& dictionary_vars ) { string message; size_t encoded_vars_length = encoded_vars.size(); @@ -386,11 +390,12 @@ namespace ffi::ir_stream { size_t dictionary_vars_ix = 0; size_t encoded_vars_ix = 0; - for 
(size_t cur_pos = 0; cur_pos < logtype.length(); ++cur_pos) { - auto c = logtype[cur_pos]; + auto logtype_ptr = buffer_begin_ptr + logtype.m_buffer_pos; + for (size_t cur_pos = 0; cur_pos < logtype.m_size; ++cur_pos) { + auto c = logtype_ptr[cur_pos]; switch(c) { case enum_to_underlying_type(VariablePlaceholder::Float): { - message.append(logtype, next_static_text_begin_pos, + message.append(logtype_ptr + next_static_text_begin_pos, cur_pos - next_static_text_begin_pos); next_static_text_begin_pos = cur_pos + 1; if (encoded_vars_ix >= encoded_vars_length) { @@ -404,7 +409,7 @@ namespace ffi::ir_stream { } case enum_to_underlying_type(VariablePlaceholder::Integer): { - message.append(logtype, next_static_text_begin_pos, + message.append(logtype_ptr + next_static_text_begin_pos, cur_pos - next_static_text_begin_pos); next_static_text_begin_pos = cur_pos + 1; if (encoded_vars_ix >= encoded_vars_length) { @@ -418,14 +423,16 @@ namespace ffi::ir_stream { } case enum_to_underlying_type(VariablePlaceholder::Dictionary): { - message.append(logtype, next_static_text_begin_pos, + message.append(logtype_ptr + next_static_text_begin_pos, cur_pos - next_static_text_begin_pos); next_static_text_begin_pos = cur_pos + 1; if (dictionary_vars_ix >= dict_vars_length) { throw EncodingException(ErrorCode_Corrupt, __FILENAME__, __LINE__, cTooFewDictionaryVarsErrorMessage); } - message.append(dictionary_vars[dictionary_vars_ix]); + auto offset = dictionary_vars[dictionary_vars_ix].m_buffer_pos; + auto size = dictionary_vars[dictionary_vars_ix].m_size; + message.append(buffer_begin_ptr + offset, size); ++dictionary_vars_ix; break; @@ -434,11 +441,11 @@ namespace ffi::ir_stream { case cVariablePlaceholderEscapeCharacter: { // Ensure the escape character is followed by a // character that's being escaped - if (cur_pos == logtype.length() - 1) { + if (cur_pos == logtype.m_size - 1) { throw EncodingException(ErrorCode_Corrupt, __FILENAME__, __LINE__, cUnexpectedEscapeCharacterMessage); } - 
message.append(logtype, next_static_text_begin_pos, + message.append(logtype_ptr + next_static_text_begin_pos, cur_pos - next_static_text_begin_pos); // Skip the escape character @@ -455,14 +462,15 @@ namespace ffi::ir_stream { } } // Add remainder - if (next_static_text_begin_pos < logtype.length()) { - message.append(logtype, next_static_text_begin_pos); + if (next_static_text_begin_pos < logtype.m_size) { + message.append(logtype_ptr + next_static_text_begin_pos, + logtype.m_size - next_static_text_begin_pos); } return message; } - IRErrorCode get_encoding_type (BufferReader& ir_buf, bool& is_four_bytes_encoding) { + IRErrorCode get_encoding_type (BufferedReaderInterface& ir_buf, bool& is_four_bytes_encoding) { char buffer[cProtocol::MagicNumberLength]; size_t num_bytes_read; auto error_code = ir_buf.try_read(buffer, cProtocol::MagicNumberLength, num_bytes_read); @@ -481,7 +489,7 @@ namespace ffi::ir_stream { return IRErrorCode_Success; } - IRErrorCode decode_preamble (BufferReader& ir_buf, encoded_tag_t& metadata_type, + IRErrorCode decode_preamble (BufferedReaderInterface& ir_buf, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size) { if (auto error_code = read_metadata_info(ir_buf, metadata_type, metadata_size); @@ -497,7 +505,7 @@ namespace ffi::ir_stream { } namespace four_byte_encoding { - IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, + IRErrorCode decode_next_message (BufferedReaderInterface& ir_buf, string& message, epoch_time_ms_t& timestamp_delta) { return generic_decode_next_message( @@ -507,7 +515,7 @@ namespace ffi::ir_stream { } namespace eight_byte_encoding { - IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, + IRErrorCode decode_next_message (BufferedReaderInterface& ir_buf, string& message, epoch_time_ms_t& timestamp) { return generic_decode_next_message( diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp 
b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 5014320fa..7ebf7df34 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -7,7 +7,7 @@ // Project headers #include "../encoding_methods.hpp" -#include "../../BufferReader.hpp" +#include "../../BufferedReaderInterface.hpp" namespace ffi::ir_stream { using encoded_tag_t = uint8_t; @@ -28,7 +28,7 @@ namespace ffi::ir_stream { * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data to * decode */ - IRErrorCode get_encoding_type (BufferReader& ir_buf, bool& is_four_bytes_encoding); + IRErrorCode get_encoding_type (BufferedReaderInterface& ir_buf, bool& is_four_bytes_encoding); /** * Decodes the preamble for an IR stream. @@ -41,7 +41,7 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough * data to decode */ - IRErrorCode decode_preamble (BufferReader& ir_buf, encoded_tag_t& metadata_type, + IRErrorCode decode_preamble (BufferedReaderInterface& ir_buf, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size); namespace eight_byte_encoding { @@ -58,7 +58,7 @@ namespace ffi::ir_stream { * to decode * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode decode_next_message (BufferReader& ir_buf, std::string& message, + IRErrorCode decode_next_message (BufferedReaderInterface& ir_buf, std::string& message, epoch_time_ms_t& timestamp); } @@ -76,7 +76,7 @@ namespace ffi::ir_stream { * to decode * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode decode_next_message (BufferReader& ir_buf, std::string& message, + IRErrorCode decode_next_message (BufferedReaderInterface& ir_buf, std::string& message, epoch_time_ms_t& timestamp_delta); } } diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 0736be5ba..51ca28c77 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp 
+++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -278,10 +278,9 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode REQUIRE(decode_preamble(ir_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Success); REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); - - string_view json_metadata; - REQUIRE(ErrorCode_Success == ir_buffer.try_seek_from_begin(metadata_pos)); - REQUIRE(ir_buffer.try_read_string_view(json_metadata, metadata_size)); + + auto json_metadata_ptr = ir_buffer.get_buffer_ptr() + metadata_pos; + string_view json_metadata {json_metadata_ptr, metadata_size}; auto metadata_json = nlohmann::json::parse(json_metadata); REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == @@ -472,9 +471,8 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", IRErrorCode::IRErrorCode_Success); REQUIRE(encoded_preamble_end_pos == complete_encoding_buffer.get_pos()); - string_view json_metadata; - REQUIRE(ErrorCode_Success == complete_encoding_buffer.try_seek_from_begin(metadata_pos)); - REQUIRE(complete_encoding_buffer.try_read_string_view(json_metadata, metadata_size)); + auto json_metadata_ptr = complete_encoding_buffer.get_buffer_ptr() + metadata_pos; + string_view json_metadata {json_metadata_ptr, metadata_size}; auto metadata_json = nlohmann::json::parse(json_metadata); REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey)); From 9119493dd00b52763d753d239dd4c04941baf91e Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sat, 20 May 2023 20:18:17 -0400 Subject: [PATCH 036/121] adding more comments and small refactoring --- components/core/CMakeLists.txt | 211 ------------------ components/core/src/BufferReader.cpp | 2 +- components/core/src/BufferReader.hpp | 48 +++- components/core/src/BufferedFileReader.cpp | 8 - components/core/src/BufferedFileReader.hpp | 86 
++++++- .../core/src/BufferedReaderInterface.cpp | 7 - .../core/src/BufferedReaderInterface.hpp | 4 - components/core/src/LibarchiveFileReader.cpp | 11 +- components/core/src/LibarchiveFileReader.hpp | 11 +- .../src/ffi/ir_stream/decoding_methods.cpp | 18 +- 10 files changed, 141 insertions(+), 265 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 8e0fb79d0..262ce172d 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -856,214 +856,3 @@ target_compile_features(unitTest ) include(cmake/utils.cmake) - -set(SOURCE_FILES_testbench - src/BufferReader.cpp - src/BufferReader.hpp - src/BufferedFileReader.cpp - src/BufferedFileReader.hpp - src/clp/CommandLineArguments.cpp - src/clp/CommandLineArguments.hpp - src/clp/compression.cpp - src/clp/compression.hpp - src/clp/decompression.cpp - src/clp/decompression.hpp - src/clp/FileCompressor.cpp - src/clp/FileCompressor.hpp - src/clp/FileDecompressor.cpp - src/clp/FileDecompressor.hpp - src/clp/FileToCompress.cpp - src/clp/FileToCompress.hpp - src/clp/run.cpp - src/clp/run.hpp - src/clp/StructuredFileToCompress.cpp - src/clp/StructuredFileToCompress.hpp - src/clp/utils.cpp - src/clp/utils.hpp - src/compressor_frontend/Constants.hpp - src/compressor_frontend/finite_automata/RegexAST.hpp - src/compressor_frontend/finite_automata/RegexAST.tpp - src/compressor_frontend/finite_automata/RegexDFA.hpp - src/compressor_frontend/finite_automata/RegexDFA.tpp - src/compressor_frontend/finite_automata/RegexNFA.hpp - src/compressor_frontend/finite_automata/RegexNFA.tpp - src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp - src/compressor_frontend/finite_automata/UnicodeIntervalTree.tpp - src/compressor_frontend/LALR1Parser.cpp - src/compressor_frontend/LALR1Parser.hpp - src/compressor_frontend/LALR1Parser.tpp - src/compressor_frontend/Lexer.hpp - src/compressor_frontend/Lexer.tpp - src/compressor_frontend/LogParser.cpp - 
src/compressor_frontend/LogParser.hpp - src/compressor_frontend/SchemaParser.cpp - src/compressor_frontend/SchemaParser.hpp - src/compressor_frontend/Token.cpp - src/compressor_frontend/Token.hpp - src/compressor_frontend/utils.cpp - src/compressor_frontend/utils.hpp - src/database_utils.cpp - src/database_utils.hpp - src/Defs.h - src/dictionary_utils.cpp - src/dictionary_utils.hpp - src/DictionaryEntry.cpp - src/DictionaryEntry.hpp - src/DictionaryReader.cpp - src/DictionaryReader.hpp - src/DictionaryWriter.cpp - src/DictionaryWriter.hpp - src/EncodedVariableInterpreter.cpp - src/EncodedVariableInterpreter.hpp - src/ErrorCode.hpp - src/ffi/encoding_methods.cpp - src/ffi/encoding_methods.hpp - src/ffi/encoding_methods.tpp - src/ffi/ir_stream/byteswap.hpp - src/ffi/ir_stream/decoding_methods.cpp - src/ffi/ir_stream/decoding_methods.hpp - src/ffi/ir_stream/encoding_methods.cpp - src/ffi/ir_stream/encoding_methods.hpp - src/ffi/ir_stream/protocol_constants.hpp - src/ffi/search/CompositeWildcardToken.cpp - src/ffi/search/CompositeWildcardToken.hpp - src/ffi/search/ExactVariableToken.cpp - src/ffi/search/ExactVariableToken.hpp - src/ffi/search/query_methods.cpp - src/ffi/search/query_methods.hpp - src/ffi/search/QueryMethodFailed.hpp - src/ffi/search/QueryToken.hpp - src/ffi/search/QueryWildcard.cpp - src/ffi/search/QueryWildcard.hpp - src/ffi/search/WildcardToken.cpp - src/ffi/search/WildcardToken.hpp - src/FileReader.cpp - src/FileReader.hpp - src/FileWriter.cpp - src/FileWriter.hpp - src/GlobalMetadataDB.cpp - src/GlobalMetadataDB.hpp - src/GlobalMetadataDBConfig.cpp - src/GlobalMetadataDBConfig.hpp - src/GlobalMySQLMetadataDB.cpp - src/GlobalMySQLMetadataDB.hpp - src/GlobalSQLiteMetadataDB.cpp - src/GlobalSQLiteMetadataDB.hpp - src/Grep.cpp - src/Grep.hpp - src/LibarchiveFileReader.cpp - src/LibarchiveFileReader.hpp - src/LibarchiveReader.cpp - src/LibarchiveReader.hpp - src/LogTypeDictionaryEntry.cpp - src/LogTypeDictionaryEntry.hpp - 
src/LogTypeDictionaryReader.cpp - src/LogTypeDictionaryReader.hpp - src/LogTypeDictionaryWriter.cpp - src/LogTypeDictionaryWriter.hpp - src/MessageParser.cpp - src/MessageParser.hpp - src/MySQLDB.cpp - src/MySQLDB.hpp - src/MySQLParamBindings.cpp - src/MySQLParamBindings.hpp - src/MySQLPreparedStatement.cpp - src/MySQLPreparedStatement.hpp - src/PageAllocatedVector.cpp - src/PageAllocatedVector.hpp - src/ParsedMessage.cpp - src/ParsedMessage.hpp - src/Platform.hpp - src/Profiler.cpp - src/Profiler.hpp - src/Query.cpp - src/Query.hpp - src/ReaderInterface.cpp - src/ReaderInterface.hpp - src/SQLiteDB.cpp - src/SQLiteDB.hpp - src/SQLitePreparedStatement.cpp - src/SQLitePreparedStatement.hpp - src/Stopwatch.cpp - src/Stopwatch.hpp - src/streaming_archive/Constants.hpp - src/streaming_archive/MetadataDB.cpp - src/streaming_archive/MetadataDB.hpp - src/streaming_archive/reader/Archive.cpp - src/streaming_archive/reader/Archive.hpp - src/streaming_archive/reader/File.cpp - src/streaming_archive/reader/File.hpp - src/streaming_archive/reader/Message.cpp - src/streaming_archive/reader/Message.hpp - src/streaming_archive/reader/Segment.cpp - src/streaming_archive/reader/Segment.hpp - src/streaming_archive/reader/SegmentManager.cpp - src/streaming_archive/reader/SegmentManager.hpp - src/streaming_archive/writer/Archive.cpp - src/streaming_archive/writer/Archive.hpp - src/streaming_archive/writer/File.cpp - src/streaming_archive/writer/File.hpp - src/streaming_archive/writer/Segment.cpp - src/streaming_archive/writer/Segment.hpp - src/streaming_compression/Compressor.cpp - src/streaming_compression/Compressor.hpp - src/streaming_compression/Constants.hpp - src/streaming_compression/Decompressor.cpp - src/streaming_compression/Decompressor.hpp - src/streaming_compression/passthrough/Compressor.cpp - src/streaming_compression/passthrough/Compressor.hpp - src/streaming_compression/passthrough/Decompressor.cpp - src/streaming_compression/passthrough/Decompressor.hpp - 
src/streaming_compression/zstd/Compressor.cpp - src/streaming_compression/zstd/Compressor.hpp - src/streaming_compression/zstd/Constants.hpp - src/streaming_compression/zstd/Decompressor.cpp - src/streaming_compression/zstd/Decompressor.hpp - src/string_utils.cpp - src/string_utils.hpp - src/string_utils.tpp - src/StringReader.cpp - src/StringReader.hpp - src/TimestampPattern.cpp - src/TimestampPattern.hpp - src/TraceableException.cpp - src/TraceableException.hpp - src/type_utils.hpp - src/Utils.cpp - src/Utils.hpp - src/VariableDictionaryEntry.cpp - src/VariableDictionaryEntry.hpp - src/VariableDictionaryReader.cpp - src/VariableDictionaryReader.hpp - src/VariableDictionaryWriter.cpp - src/VariableDictionaryWriter.hpp - src/version.hpp - src/WriterInterface.cpp - src/WriterInterface.hpp - submodules/Catch2/single_include/catch2/catch.hpp - submodules/date/include/date/date.h - submodules/json/single_include/nlohmann/json.hpp - submodules/sqlite3/sqlite3.c - submodules/sqlite3/sqlite3.h - submodules/sqlite3/sqlite3ext.h - src/temp_testbench/main.cpp - src/BufferedReaderInterface.cpp src/BufferedReaderInterface.hpp) - -add_executable(testbench - ${SOURCE_FILES_testbench} - ) -target_link_libraries(testbench - PRIVATE - Boost::filesystem Boost::iostreams Boost::program_options - fmt::fmt - LibArchive::LibArchive - MariaDBClient::MariaDBClient - spdlog::spdlog - ${sqlite_LIBRARY_DEPENDENCIES} - ${STD_FS_LIBS} - yaml-cpp::yaml-cpp - ZStd::ZStd - ) -target_compile_features(testbench - PRIVATE cxx_std_17 - ) \ No newline at end of file diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 2232aeea8..54216a107 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -56,4 +56,4 @@ bool BufferReader::try_read_string_view (MyStringView& str_view, size_t read_siz const char* BufferReader::get_buffer_ptr () { return m_data; -} \ No newline at end of file +} diff --git 
a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 199446f96..9d8af956d 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -14,7 +14,7 @@ class BufferReader : public BufferedReaderInterface { TraceableException (error_code, filename, line_number) {} // Methods - const char* what () const noexcept override { + [[nodiscard]] const char* what () const noexcept override { return "BufferReader operation failed"; } }; @@ -23,30 +23,64 @@ class BufferReader : public BufferedReaderInterface { BufferReader () : m_data(nullptr), m_size(0), m_cursor_pos(0) {} BufferReader (const char* data, size_t size) : m_data(data), m_size(size), m_cursor_pos(0) {} - // Methods + // Methods implementing the ReaderInterface + /** + * Tries to read up to a given number of bytes from the buffer + * @param buf + * @param num_bytes_to_read The number of bytes to try and read + * @param num_bytes_read The actual number of bytes read + * @return ErrorCode_NotInit if the buffer is not initialized + * @return ErrorCode_BadParam if buf is invalid + * @return ErrorCode_EndOfFile if buffer doesn't contain more data + * @return ErrorCode_Success on success + */ [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; + /** + * Tries to seek from the beginning of the buffer to the given position + * @param pos + * @return ErrorCode_NotInit if the buffer is not initialized + * @return ErrorCode_OutOfBounds if the given position >= the buffer's size + * @return ErrorCode_Success on success + */ [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; + /** + * Tries to get the current position of the read head in the buffer + * @param pos Position of the read head in the buffer + * @return ErrorCode_NotInit if the buffer is not initialized + * @return ErrorCode_Success on success + */ [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; + // Methods /** - * Tries 
reading a string view of size = read_size from the ir_buf. + * Tries reading a string view of size = read_size from the buffered data * @param str_view Returns the string view * @param read_size - * @return true on success, false if the ir_buf doesn't contain enough - * data to decode + * @return true on success, false if the BufferReader doesn't contain + * enough data **/ [[nodiscard]] virtual bool try_read_string_view (MyStringView& str_view, size_t read_size) override; - [[nodiscard]] virtual const char* get_buffer_ptr () override; - + /** + * Lets the BufferReader points to data with given size. + * @param data + * @param size + **/ void reset_buffer (const char* data, size_t size) { m_data = data; m_size = size; m_cursor_pos = 0; } + /** + * Returns a pointer pointing to the data + * @return + */ + [[nodiscard]] virtual const char* get_buffer_ptr () override; + + private: const char* m_data; size_t m_size; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 59faebfcc..c4cc9efba 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -231,14 +231,6 @@ ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { return ErrorCode_Success; } -void BufferedFileReader::revert_pos() { - if (false == m_checkpoint_enabled) { - SPDLOG_ERROR("Checkpoint is not enabled"); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - m_file_pos = m_checkpoint_pos; -} - size_t BufferedFileReader::mark_pos() { if (true == m_checkpoint_enabled) { SPDLOG_ERROR("I haven't carefully think about whether we should allow this or not"); diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 1df8643a7..1da83cf3d 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -67,10 +67,13 @@ class BufferedFileReader : public BufferedReaderInterface { size_t& 
num_bytes_read) override; /** - * Tries to read a string from the file until it reaches the specified delimiter + * Tries to read a string from the file until it reaches + * the specified delimiter * @param delim The delimiter to stop at - * @param keep_delimiter Whether to include the delimiter in the output string or not - * @param append Whether to append to the given string or replace its contents + * @param keep_delimiter Whether to include the delimiter in the + * output string or not + * @param append Whether to append to the given string or + * replace its contents * @param str The string read * @return ErrorCode_Success on success * @return ErrorCode_EndOfFile on EOF @@ -111,7 +114,31 @@ class BufferedFileReader : public BufferedReaderInterface { */ [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer) const; + /** + * Set the internal buffer + * @param buffer_size + * @return ErrorCode_Success on success + * @return ErrorCode_Failure if the file is not closed + * @return ErrorCode_BadParam if either: + * 1. buffer_size is not a integer multiple of 4096 + * 2. buffer_size is not a power of 2 + */ [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); + + /** + * Peeks the next peek_size bytes of data without advancing the file + * pos. 
+ * Note: If further operation such as read or peek is called on the + * BufferedFileReader after peek_buffered_data, the data_ptr could + * point to invalid data + * @param size_to_peek + * @param data_ptr pointer pointing to peeked data + * @param peek_size returns number of bytes peeked by reference + * @return ErrorCode_Success on success + * @return ErrorCode_errno on error + * @return ErrorCode_NotInit if the file is not opened + * @return ErrorCode_EndOfFile if already reaching the eof + */ [[nodiscard]] ErrorCode peek_buffered_data(size_t size_to_peek, const char*& data_ptr, size_t& peek_size); @@ -119,26 +146,66 @@ class BufferedFileReader : public BufferedReaderInterface { * Tries reading a string view of size = read_size from the ir_buf. * @param str_view Returns the string view * @param read_size - * @return true on success, false if the ir_buf doesn't contain enough - * data to decode + * @return true on success, false if the file doesn't contain enough + * data **/ [[nodiscard]] virtual bool try_read_string_view (MyStringView& str_view, size_t read_size) override; + /** + * returns a const pointer to the internal buffer + */ [[nodiscard]] virtual const char* get_buffer_ptr () override; + /** + * Sets a checkpoint pos. the BufferedFileReader guarantees that + * all data after checkpoint pos will be buffered in the memory and + * support seek. 
+ * @return current file pos + */ size_t mark_pos(); - void revert_pos(); + + /** + * Disable the checkpoint pos and release buffered data from memory + */ void reset_checkpoint (); private: + // Methods [[nodiscard]] size_t cursor_pos() const { return m_file_pos - m_buffer_begin_pos; } [[nodiscard]] char* buffer_head() const { return m_buffer.get() + cursor_pos(); } - [[nodiscard]] size_t remaining_data_size() const; + + /** + * Quantize the given size to be the next integer multiple of buffer_size + * @param size + * @return quantized size + */ [[nodiscard]] size_t quantize_to_buffer_size(size_t size); + + /** + * Reads next refill_size bytes from file descriptor to the internal buffer + * and sets the data size of the internal buffer + * Note: the function returns success even if the number of bytes read from + * the fd is less than the refill_size + * @param refill_size + * @return ErrorCode_Success on success + * @return ErrorCode_errno on error + * @return ErrorCode_NotInit if the file is not opened + * @return ErrorCode_EndOfFile if already reaching the eof + */ [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); + + /** + * Similar to refill_reader_buffer, except that number of bytes refilled + * is returned by reference + * @param refill_size + * @param num_bytes_refilled Returns the number of bytes refilled by + * reference + * @return Same as refill_reader_buffer(size_t refill_size) + */ [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); + // Types size_t m_file_pos; int m_fd; @@ -149,11 +216,12 @@ class BufferedFileReader : public BufferedReaderInterface { size_t m_data_size; size_t m_buffer_begin_pos; - // constant + // constant for buffer related calculation size_t m_buffer_exp; size_t m_buffer_size; size_t m_buffer_aligned_mask; - // checkpoint specific data + + // Variables for checkpoint support bool m_checkpoint_enabled; size_t m_checkpoint_pos; }; diff --git 
a/components/core/src/BufferedReaderInterface.cpp b/components/core/src/BufferedReaderInterface.cpp index b94be0da0..f64b71ded 100644 --- a/components/core/src/BufferedReaderInterface.cpp +++ b/components/core/src/BufferedReaderInterface.cpp @@ -1,8 +1 @@ - #include "BufferedReaderInterface.hpp" - -// C standard libraries - -// C++ standard libraries - -// Project headers \ No newline at end of file diff --git a/components/core/src/BufferedReaderInterface.hpp b/components/core/src/BufferedReaderInterface.hpp index b0f38f816..ee258cf79 100644 --- a/components/core/src/BufferedReaderInterface.hpp +++ b/components/core/src/BufferedReaderInterface.hpp @@ -1,10 +1,6 @@ #ifndef newReaderInterface_HPP #define newReaderInterface_HPP -// C standard libraries - -// C++ standard libraries - // Project headers #include "ReaderInterface.hpp" diff --git a/components/core/src/LibarchiveFileReader.cpp b/components/core/src/LibarchiveFileReader.cpp index ff40deb2f..92c691877 100644 --- a/components/core/src/LibarchiveFileReader.cpp +++ b/components/core/src/LibarchiveFileReader.cpp @@ -186,22 +186,21 @@ ErrorCode LibarchiveFileReader::peek_data_block (size_t size_to_peek, const char } } - // If we don't need to simulate reading '\0' before the start of the data block - // simply return a const& to the current data block if (m_data_block_pos_in_file <= m_pos_in_file) { + // No need to to simulate reading '\0' before the start of the data block + // simply return a const pointer to the current data block peek_size = std::min(size_to_peek, m_data_block_length - m_pos_in_data_block); data_ptr = reinterpret_cast(m_data_block); return ErrorCode_Success; } - // If there are sparse bytes before the data block, the pos in data block + // There are sparse bytes before the data block, so the pos in data block // must be 0 assert(m_pos_in_data_block != 0); - auto num_sparse_bytes = m_data_block_pos_in_file - m_pos_in_file; peek_size = std::min(num_sparse_bytes + m_data_block_length, 
size_to_peek); - // resize the local buffer is necessary + // resize the local buffer if necessary if (m_data_for_peek.size() < peek_size) { m_data_for_peek.resize(peek_size); } @@ -212,7 +211,7 @@ ErrorCode LibarchiveFileReader::peek_data_block (size_t size_to_peek, const char return ErrorCode_Success; } - // if size to peek is greater than number of sparse bytes, + // Size to peek is greater than number of sparse bytes, // copy over the data from data_block to the peek data buffer memset(m_data_for_peek.data(), '\0', num_sparse_bytes); size_t remaining_bytes_to_peek = peek_size - num_sparse_bytes; diff --git a/components/core/src/LibarchiveFileReader.hpp b/components/core/src/LibarchiveFileReader.hpp index 9552dd915..11354fd26 100644 --- a/components/core/src/LibarchiveFileReader.hpp +++ b/components/core/src/LibarchiveFileReader.hpp @@ -72,11 +72,12 @@ class LibarchiveFileReader : public ReaderInterface { ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; /** - * Tries to peek up to a given number of bytes from the file. - * Note: This function only tries to peek within the next data block. 
+ * Tries to peek up to a given number of bytes from the next data block + * if no enough data is available in the next data block, a smaller peek + * size will be returned * @param size_to_peek * @param data_ptr - * @param peek_size + * @param peek_size Return the number of bytes peeked by reference * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Failure on failure * @return ErrorCode_Success on success @@ -111,11 +112,13 @@ class LibarchiveFileReader : public ReaderInterface { la_int64_t m_data_block_pos_in_file; const void* m_data_block; size_t m_data_block_length; - std::vector m_data_for_peek; la_int64_t m_pos_in_data_block; bool m_reached_eof; size_t m_pos_in_file; + + // vector to hold peeked data + std::vector m_data_for_peek; }; #endif // LIBARCHIVEFILEREADER_HPP diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 195d0ab0e..f8006d750 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -55,8 +55,8 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough * data to decode */ - static IRErrorCode parse_dictionary_var (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, - MyStringView& dict_var); + static IRErrorCode parse_dictionary_var (BufferedReaderInterface& ir_buf, + encoded_tag_t encoded_tag, MyStringView& dict_var); /** * Parses the next timestamp from ir_buf @@ -72,7 +72,8 @@ namespace ffi::ir_stream { * to decode */ template - IRErrorCode parse_timestamp (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); + IRErrorCode parse_timestamp (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, + epoch_time_ms_t& ts); /** * Decodes the next encoded message from ir_buf @@ -90,7 +91,8 @@ namespace ffi::ir_stream { * to decode */ template - static IRErrorCode generic_decode_next_message 
(BufferedReaderInterface& ir_buf, string& message, + static IRErrorCode generic_decode_next_message (BufferedReaderInterface& ir_buf, + string& message, epoch_time_ms_t& timestamp); /** @@ -104,8 +106,8 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data * to decode */ - static IRErrorCode read_metadata_info (BufferedReaderInterface& ir_buf, encoded_tag_t& metadata_type, - uint16_t& metadata_size); + static IRErrorCode read_metadata_info (BufferedReaderInterface& ir_buf, + encoded_tag_t& metadata_type, uint16_t& metadata_size); /** * Decodes the message from the given logtype, encoded variables, and @@ -206,8 +208,8 @@ namespace ffi::ir_stream { return IRErrorCode_Success; } - static IRErrorCode parse_dictionary_var (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, - MyStringView& dict_var) { + static IRErrorCode parse_dictionary_var (BufferedReaderInterface& ir_buf, + encoded_tag_t encoded_tag, MyStringView& dict_var) { // Decode variable's length size_t var_length; if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { From 487bc49263b35241e1d921c17d69ef221e82575e Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 31 May 2023 00:03:21 -0400 Subject: [PATCH 037/121] Temporary Fix to disable string_view optimization --- components/core/src/BufferReader.cpp | 17 --- components/core/src/BufferReader.hpp | 21 +--- components/core/src/BufferedFileReader.cpp | 24 ---- components/core/src/BufferedFileReader.hpp | 19 +--- .../core/src/BufferedReaderInterface.cpp | 1 - .../core/src/BufferedReaderInterface.hpp | 24 ---- .../src/ffi/ir_stream/decoding_methods.cpp | 107 ++++++++++-------- .../src/ffi/ir_stream/decoding_methods.hpp | 23 +++- .../core/tests/test-ir_encoding_methods.cpp | 17 ++- 9 files changed, 93 insertions(+), 160 deletions(-) delete mode 100644 components/core/src/BufferedReaderInterface.cpp delete mode 100644 
components/core/src/BufferedReaderInterface.hpp diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 54216a107..a94e97395 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -40,20 +40,3 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n pos = m_cursor_pos; return ErrorCode_Success; } - -bool BufferReader::try_read_string_view (MyStringView& str_view, size_t read_size) { - if (nullptr == m_data) { - return ErrorCode_NotInit; - } - if ((m_cursor_pos + read_size) > m_size) { - return false; - } - str_view.m_buffer_pos = m_cursor_pos; - str_view.m_size = read_size; - m_cursor_pos += read_size; - return true; -} - -const char* BufferReader::get_buffer_ptr () { - return m_data; -} diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 9d8af956d..17dbc928a 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -2,9 +2,9 @@ #define BufferReader_HPP // Project headers -#include "BufferedReaderInterface.hpp" +#include "ReaderInterface.hpp" -class BufferReader : public BufferedReaderInterface { +class BufferReader : public ReaderInterface { public: // Types class OperationFailed : public TraceableException { @@ -52,17 +52,6 @@ class BufferReader : public BufferedReaderInterface { */ [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; - // Methods - /** - * Tries reading a string view of size = read_size from the buffered data - * @param str_view Returns the string view - * @param read_size - * @return true on success, false if the BufferReader doesn't contain - * enough data - **/ - [[nodiscard]] virtual bool try_read_string_view (MyStringView& str_view, - size_t read_size) override; - /** * Lets the BufferReader points to data with given size. 
* @param data @@ -74,12 +63,6 @@ class BufferReader : public BufferedReaderInterface { m_cursor_pos = 0; } - /** - * Returns a pointer pointing to the data - * @return - */ - [[nodiscard]] virtual const char* get_buffer_ptr () override; - private: const char* m_data; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index c4cc9efba..78f0ac5a4 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -354,30 +354,6 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, return ErrorCode_Success; } -bool BufferedFileReader::try_read_string_view (MyStringView& str_view, size_t read_size) { - if (-1 == m_fd) { - return false; - } - if (false == m_checkpoint_enabled) { - SPDLOG_ERROR("Can't read string view when checkpoint is not enabled"); - throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); - } - str_view.m_buffer_pos = cursor_pos(); - // try to seek to the string view pos. 
as to pretend that the string has - // been read passed - if (auto error_code = try_seek_from_begin(m_file_pos + read_size); - error_code != ErrorCode_Success) { - SPDLOG_ERROR("Unexpected error happened"); - throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); - } - str_view.m_size = read_size; - return true; -} - -const char* BufferedFileReader::get_buffer_ptr () { - return m_buffer.get(); -} - static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, size_t& num_bytes_read) { num_bytes_read = 0; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 1da83cf3d..691ace381 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -11,11 +11,11 @@ // Project headers #include "Defs.h" #include "ErrorCode.hpp" -#include "BufferedReaderInterface.hpp" +#include "ReaderInterface.hpp" #include "TraceableException.hpp" -class BufferedFileReader : public BufferedReaderInterface { +class BufferedFileReader : public ReaderInterface { public: static constexpr size_t cDefaultBufferSize = 65536; // Types @@ -142,21 +142,6 @@ class BufferedFileReader : public BufferedReaderInterface { [[nodiscard]] ErrorCode peek_buffered_data(size_t size_to_peek, const char*& data_ptr, size_t& peek_size); - /** - * Tries reading a string view of size = read_size from the ir_buf. - * @param str_view Returns the string view - * @param read_size - * @return true on success, false if the file doesn't contain enough - * data - **/ - [[nodiscard]] virtual bool try_read_string_view (MyStringView& str_view, - size_t read_size) override; - - /** - * returns a const pointer to the internal buffer - */ - [[nodiscard]] virtual const char* get_buffer_ptr () override; - /** * Sets a checkpoint pos. 
the BufferedFileReader guarantees that * all data after checkpoint pos will be buffered in the memory and diff --git a/components/core/src/BufferedReaderInterface.cpp b/components/core/src/BufferedReaderInterface.cpp deleted file mode 100644 index f64b71ded..000000000 --- a/components/core/src/BufferedReaderInterface.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "BufferedReaderInterface.hpp" diff --git a/components/core/src/BufferedReaderInterface.hpp b/components/core/src/BufferedReaderInterface.hpp deleted file mode 100644 index ee258cf79..000000000 --- a/components/core/src/BufferedReaderInterface.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef newReaderInterface_HPP -#define newReaderInterface_HPP - -// Project headers -#include "ReaderInterface.hpp" - -class BufferedReaderInterface : public ReaderInterface { -public: - // Types - class MyStringView { - public: - MyStringView(): m_buffer_pos(0), m_size(0) {} - MyStringView(size_t pos, size_t size) : - m_buffer_pos(pos), m_size(size) {} - // variable - size_t m_buffer_pos; - size_t m_size; - }; - [[nodiscard]] virtual bool try_read_string_view (MyStringView& str_view, size_t read_size) = 0; - [[nodiscard]] virtual const char* get_buffer_ptr () = 0; -}; - - -#endif // newReaderInterface_HPP diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index f8006d750..83bff5184 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -6,9 +6,7 @@ using std::is_same_v; using std::string; -using std::string_view; using std::vector; -using MyStringView = BufferedReaderInterface::MyStringView; namespace ffi::ir_stream { /** @@ -30,7 +28,7 @@ namespace ffi::ir_stream { * to decode */ template - static bool decode_int (BufferedReaderInterface& ir_buf, integer_t& value); + static bool decode_int (ReaderInterface& ir_buf, integer_t& value); /** * Decodes the next logtype string from ir_buf @@ 
-42,8 +40,8 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data * to decode */ - static IRErrorCode parse_logtype (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, - MyStringView& logtype); + static IRErrorCode parse_logtype (ReaderInterface& ir_buf, encoded_tag_t encoded_tag, + string& logtype); /** * Decodes the next dictionary-type variable string from ir_buf @@ -55,8 +53,8 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough * data to decode */ - static IRErrorCode parse_dictionary_var (BufferedReaderInterface& ir_buf, - encoded_tag_t encoded_tag, MyStringView& dict_var); + static IRErrorCode parse_dictionary_var (ReaderInterface& ir_buf, + encoded_tag_t encoded_tag, string& dict_var); /** * Parses the next timestamp from ir_buf @@ -72,7 +70,7 @@ namespace ffi::ir_stream { * to decode */ template - IRErrorCode parse_timestamp (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, + IRErrorCode parse_timestamp (ReaderInterface& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); /** @@ -91,7 +89,7 @@ namespace ffi::ir_stream { * to decode */ template - static IRErrorCode generic_decode_next_message (BufferedReaderInterface& ir_buf, + static IRErrorCode generic_decode_next_message (ReaderInterface& ir_buf, string& message, epoch_time_ms_t& timestamp); @@ -106,7 +104,7 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data * to decode */ - static IRErrorCode read_metadata_info (BufferedReaderInterface& ir_buf, + static IRErrorCode read_metadata_info (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, uint16_t& metadata_size); /** @@ -123,10 +121,9 @@ namespace ffi::ir_stream { */ template static string decode_message ( - MyStringView& logtype, - const char* buffer_begin_ptr, + const string& logtype, const vector& encoded_vars, - const vector& dictionary_vars + const vector& 
dictionary_vars ); template @@ -156,7 +153,7 @@ namespace ffi::ir_stream { } template - static bool decode_int (BufferedReaderInterface& ir_buf, integer_t& value) { + static bool decode_int (ReaderInterface& ir_buf, integer_t& value) { integer_t value_small_endian; if (ir_buf.try_read_numeric_value(value_small_endian) != ErrorCode_Success) { return false; @@ -176,8 +173,8 @@ namespace ffi::ir_stream { return true; } - static IRErrorCode parse_logtype (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, - MyStringView& logtype) + static IRErrorCode parse_logtype (ReaderInterface& ir_buf, encoded_tag_t encoded_tag, + string& logtype) { size_t logtype_length; if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { @@ -202,14 +199,14 @@ namespace ffi::ir_stream { return IRErrorCode_Corrupted_IR; } - if (ir_buf.try_read_string_view(logtype, logtype_length) == false) { + if (ErrorCode_Success != ir_buf.try_read_string(logtype_length, logtype)) { return IRErrorCode_Incomplete_IR; } return IRErrorCode_Success; } - static IRErrorCode parse_dictionary_var (BufferedReaderInterface& ir_buf, - encoded_tag_t encoded_tag, MyStringView& dict_var) { + static IRErrorCode parse_dictionary_var (ReaderInterface& ir_buf, + encoded_tag_t encoded_tag, string& dict_var) { // Decode variable's length size_t var_length; if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { @@ -235,7 +232,7 @@ namespace ffi::ir_stream { } // Read the dictionary variable - if (false == ir_buf.try_read_string_view(dict_var, var_length)) { + if (ErrorCode_Success != ir_buf.try_read_string(var_length, dict_var)) { return IRErrorCode_Incomplete_IR; } @@ -243,7 +240,7 @@ namespace ffi::ir_stream { } template - IRErrorCode parse_timestamp (BufferedReaderInterface& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) + IRErrorCode parse_timestamp (ReaderInterface& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { static_assert(is_same_v || is_same_v); @@ -282,7 +279,7 @@ namespace 
ffi::ir_stream { } template - static IRErrorCode generic_decode_next_message (BufferedReaderInterface& ir_buf, string& message, + static IRErrorCode generic_decode_next_message (ReaderInterface& ir_buf, string& message, epoch_time_ms_t& timestamp) { encoded_tag_t encoded_tag; @@ -295,9 +292,9 @@ namespace ffi::ir_stream { // Handle variables vector encoded_vars; - vector dict_vars; + vector dict_vars; encoded_variable_t encoded_variable; - MyStringView var_str; + string var_str; bool is_encoded_var; while (is_variable_tag(encoded_tag, is_encoded_var)) { if (is_encoded_var) { @@ -319,7 +316,7 @@ namespace ffi::ir_stream { } // Handle logtype - MyStringView logtype; + string logtype; if (auto error_code = parse_logtype(ir_buf, encoded_tag, logtype); IRErrorCode_Success != error_code) { @@ -338,15 +335,14 @@ namespace ffi::ir_stream { } try { - auto buffer_begin_ptr = ir_buf.get_buffer_ptr(); - message = decode_message(logtype, buffer_begin_ptr, encoded_vars, dict_vars); + message = decode_message(logtype, encoded_vars, dict_vars); } catch (const EncodingException& e) { return IRErrorCode_Decode_Error; } return IRErrorCode_Success; } - static IRErrorCode read_metadata_info (BufferedReaderInterface& ir_buf, encoded_tag_t& metadata_type, + static IRErrorCode read_metadata_info (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, uint16_t& metadata_size) { if (ErrorCode_Success != ir_buf.try_read_numeric_value(metadata_type)) { return IRErrorCode_Incomplete_IR; @@ -380,10 +376,9 @@ namespace ffi::ir_stream { template static string decode_message ( - MyStringView& logtype, - const char* buffer_begin_ptr, + const string& logtype, const vector& encoded_vars, - const vector& dictionary_vars + const vector& dictionary_vars ) { string message; size_t encoded_vars_length = encoded_vars.size(); @@ -392,12 +387,11 @@ namespace ffi::ir_stream { size_t dictionary_vars_ix = 0; size_t encoded_vars_ix = 0; - auto logtype_ptr = buffer_begin_ptr + logtype.m_buffer_pos; - for (size_t 
cur_pos = 0; cur_pos < logtype.m_size; ++cur_pos) { - auto c = logtype_ptr[cur_pos]; + for (size_t cur_pos = 0; cur_pos < logtype.size(); ++cur_pos) { + auto c = logtype[cur_pos]; switch(c) { case enum_to_underlying_type(VariablePlaceholder::Float): { - message.append(logtype_ptr + next_static_text_begin_pos, + message.append(logtype, next_static_text_begin_pos, cur_pos - next_static_text_begin_pos); next_static_text_begin_pos = cur_pos + 1; if (encoded_vars_ix >= encoded_vars_length) { @@ -411,7 +405,7 @@ namespace ffi::ir_stream { } case enum_to_underlying_type(VariablePlaceholder::Integer): { - message.append(logtype_ptr + next_static_text_begin_pos, + message.append(logtype, next_static_text_begin_pos, cur_pos - next_static_text_begin_pos); next_static_text_begin_pos = cur_pos + 1; if (encoded_vars_ix >= encoded_vars_length) { @@ -425,16 +419,14 @@ namespace ffi::ir_stream { } case enum_to_underlying_type(VariablePlaceholder::Dictionary): { - message.append(logtype_ptr + next_static_text_begin_pos, + message.append(logtype, next_static_text_begin_pos, cur_pos - next_static_text_begin_pos); next_static_text_begin_pos = cur_pos + 1; if (dictionary_vars_ix >= dict_vars_length) { throw EncodingException(ErrorCode_Corrupt, __FILENAME__, __LINE__, cTooFewDictionaryVarsErrorMessage); } - auto offset = dictionary_vars[dictionary_vars_ix].m_buffer_pos; - auto size = dictionary_vars[dictionary_vars_ix].m_size; - message.append(buffer_begin_ptr + offset, size); + message.append(dictionary_vars[dictionary_vars_ix]); ++dictionary_vars_ix; break; @@ -443,11 +435,11 @@ namespace ffi::ir_stream { case cVariablePlaceholderEscapeCharacter: { // Ensure the escape character is followed by a // character that's being escaped - if (cur_pos == logtype.m_size - 1) { + if (cur_pos == logtype.size() - 1) { throw EncodingException(ErrorCode_Corrupt, __FILENAME__, __LINE__, cUnexpectedEscapeCharacterMessage); } - message.append(logtype_ptr + next_static_text_begin_pos, + 
message.append(logtype, next_static_text_begin_pos, cur_pos - next_static_text_begin_pos); // Skip the escape character @@ -464,15 +456,15 @@ namespace ffi::ir_stream { } } // Add remainder - if (next_static_text_begin_pos < logtype.m_size) { - message.append(logtype_ptr + next_static_text_begin_pos, - logtype.m_size - next_static_text_begin_pos); + if (next_static_text_begin_pos < logtype.size()) { + message.append(logtype, next_static_text_begin_pos, + logtype.size() - next_static_text_begin_pos); } return message; } - IRErrorCode get_encoding_type (BufferedReaderInterface& ir_buf, bool& is_four_bytes_encoding) { + IRErrorCode get_encoding_type (ReaderInterface& ir_buf, bool& is_four_bytes_encoding) { char buffer[cProtocol::MagicNumberLength]; size_t num_bytes_read; auto error_code = ir_buf.try_read(buffer, cProtocol::MagicNumberLength, num_bytes_read); @@ -491,7 +483,7 @@ namespace ffi::ir_stream { return IRErrorCode_Success; } - IRErrorCode decode_preamble (BufferedReaderInterface& ir_buf, encoded_tag_t& metadata_type, + IRErrorCode decode_preamble (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size) { if (auto error_code = read_metadata_info(ir_buf, metadata_type, metadata_size); @@ -506,8 +498,25 @@ namespace ffi::ir_stream { return IRErrorCode_Success; } + IRErrorCode decode_preamble (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, + std::vector& metadata) + { + uint16_t metadata_size; + if (auto error_code = read_metadata_info(ir_buf, metadata_type, metadata_size); + error_code != IRErrorCode_Success) { + return error_code; + } + + metadata.resize(metadata_size); + if (ErrorCode_Success != ir_buf.try_read_exact_length( + reinterpret_cast(metadata.data()), metadata_size)) { + return IRErrorCode_Incomplete_IR; + } + return IRErrorCode_Success; + } + namespace four_byte_encoding { - IRErrorCode decode_next_message (BufferedReaderInterface& ir_buf, string& message, + IRErrorCode decode_next_message 
(ReaderInterface& ir_buf, string& message, epoch_time_ms_t& timestamp_delta) { return generic_decode_next_message( @@ -517,7 +526,7 @@ namespace ffi::ir_stream { } namespace eight_byte_encoding { - IRErrorCode decode_next_message (BufferedReaderInterface& ir_buf, string& message, + IRErrorCode decode_next_message (ReaderInterface& ir_buf, string& message, epoch_time_ms_t& timestamp) { return generic_decode_next_message( diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 7ebf7df34..0f8411e12 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -7,7 +7,7 @@ // Project headers #include "../encoding_methods.hpp" -#include "../../BufferedReaderInterface.hpp" +#include "../../ReaderInterface.hpp" namespace ffi::ir_stream { using encoded_tag_t = uint8_t; @@ -28,7 +28,7 @@ namespace ffi::ir_stream { * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data to * decode */ - IRErrorCode get_encoding_type (BufferedReaderInterface& ir_buf, bool& is_four_bytes_encoding); + IRErrorCode get_encoding_type (ReaderInterface& ir_buf, bool& is_four_bytes_encoding); /** * Decodes the preamble for an IR stream. @@ -41,9 +41,22 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough * data to decode */ - IRErrorCode decode_preamble (BufferedReaderInterface& ir_buf, encoded_tag_t& metadata_type, + IRErrorCode decode_preamble (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size); + /** + * Decodes the preamble for an IR stream. 
+ * @param ir_buf + * @param metadata_type Returns the type of the metadata found in the IR + * @param metadata Returns the metadata as a vector by reference + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR + * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough + * data to decode + */ + IRErrorCode decode_preamble (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, + std::vector& metadata); + namespace eight_byte_encoding { /** * Decodes the next message for the eight-byte encoding IR stream. @@ -58,7 +71,7 @@ namespace ffi::ir_stream { * to decode * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode decode_next_message (BufferedReaderInterface& ir_buf, std::string& message, + IRErrorCode decode_next_message (ReaderInterface& ir_buf, std::string& message, epoch_time_ms_t& timestamp); } @@ -76,7 +89,7 @@ namespace ffi::ir_stream { * to decode * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode decode_next_message (BufferedReaderInterface& ir_buf, std::string& message, + IRErrorCode decode_next_message (ReaderInterface& ir_buf, std::string& message, epoch_time_ms_t& timestamp_delta); } } diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 51ca28c77..44e366d25 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -279,10 +279,19 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode IRErrorCode::IRErrorCode_Success); REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); - auto json_metadata_ptr = ir_buffer.get_buffer_ptr() + metadata_pos; - string_view json_metadata {json_metadata_ptr, metadata_size}; + auto json_metadata_ptr = reinterpret_cast(ir_buf.data() + metadata_pos); + string_view json_metadata_ref {json_metadata_ptr, metadata_size}; - auto metadata_json = 
nlohmann::json::parse(json_metadata); + // Test if preamble can be decoded by the string copy method + std::vector json_metadata_vec; + ir_buffer.seek_from_begin(MagicNumberLength); + REQUIRE(decode_preamble(ir_buffer, metadata_type, json_metadata_vec) == + IRErrorCode::IRErrorCode_Success); + string_view json_metadata_copied {reinterpret_cast(json_metadata_vec.data()), + json_metadata_vec.size()}; + REQUIRE (json_metadata_copied == json_metadata_ref); + + auto metadata_json = nlohmann::json::parse(json_metadata_ref); REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey)); REQUIRE(ffi::ir_stream::cProtocol::Metadata::EncodingJson == metadata_type); @@ -471,7 +480,7 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", IRErrorCode::IRErrorCode_Success); REQUIRE(encoded_preamble_end_pos == complete_encoding_buffer.get_pos()); - auto json_metadata_ptr = complete_encoding_buffer.get_buffer_ptr() + metadata_pos; + auto json_metadata_ptr = reinterpret_cast(ir_buf.data() + metadata_pos); string_view json_metadata {json_metadata_ptr, metadata_size}; auto metadata_json = nlohmann::json::parse(json_metadata); REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == From be7951a478bde3e6f4b8fa51561f30f2ba600a56 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 31 May 2023 12:19:48 -0400 Subject: [PATCH 038/121] Fix for code review --- components/core/src/BufferReader.cpp | 19 +- components/core/src/BufferReader.hpp | 25 +-- components/core/src/BufferedFileReader.cpp | 9 +- components/core/src/BufferedFileReader.hpp | 13 +- components/core/src/LibarchiveFileReader.cpp | 2 +- .../src/ffi/ir_stream/decoding_methods.cpp | 170 +++++++++--------- .../src/ffi/ir_stream/decoding_methods.hpp | 7 +- .../src/ffi/ir_stream/encoding_methods.hpp | 24 +-- .../core/tests/test-BufferedFileReader.cpp | 2 +- .../core/tests/test-ir_encoding_methods.cpp | 40 
++--- 10 files changed, 162 insertions(+), 149 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index a94e97395..f0e7af9fb 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -1,23 +1,28 @@ #include "BufferReader.hpp" // C++ standard libraries +#include #include -// Project headers -#include "spdlog/spdlog.h" - using std::string_view; ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { if (nullptr == m_data) { return ErrorCode_NotInit; } - if (nullptr == buf) { + if (nullptr == buf && num_bytes_to_read > 0) { return ErrorCode_BadParam; } - num_bytes_read = std::min(m_size - m_cursor_pos, num_bytes_to_read); - memcpy(buf, m_data + m_cursor_pos, num_bytes_read); + auto remaining_data_size = m_data_size - m_cursor_pos; + if (remaining_data_size == 0) { + return ErrorCode_EndOfFile; + } + + num_bytes_read = std::min(remaining_data_size, num_bytes_to_read); + auto copy_begin = m_data + m_cursor_pos; + auto copy_end = copy_begin + num_bytes_read; + std::copy(copy_begin, copy_end, buf); m_cursor_pos += num_bytes_read; return ErrorCode_Success; } @@ -26,7 +31,7 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n if (nullptr == m_data) { return ErrorCode_NotInit; } - if (pos > m_size) { + if (pos > m_data_size) { return ErrorCode_OutOfBounds; } m_cursor_pos = pos; diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 17dbc928a..9e4546007 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -1,9 +1,12 @@ -#ifndef BufferReader_HPP -#define BufferReader_HPP +#ifndef BUFFERREADER_HPP +#define BUFFERREADER_HPP // Project headers #include "ReaderInterface.hpp" +/** + * Class that represents a ReaderInterface to a fixed size buffer + */ class BufferReader : public ReaderInterface { public: // Types @@ -20,8 +23,8 @@ class 
BufferReader : public ReaderInterface { }; // Constructors - BufferReader () : m_data(nullptr), m_size(0), m_cursor_pos(0) {} - BufferReader (const char* data, size_t size) : m_data(data), m_size(size), m_cursor_pos(0) {} + BufferReader () : m_data(nullptr), m_data_size(0), m_cursor_pos(0) {} + BufferReader (const char* data, size_t size) : m_data(data), m_data_size(size), m_cursor_pos(0) {} // Methods implementing the ReaderInterface /** @@ -46,29 +49,27 @@ class BufferReader : public ReaderInterface { [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; /** * Tries to get the current position of the read head in the buffer - * @param pos Position of the read head in the buffer + * @param pos Returns the position of the read head in the buffer * @return ErrorCode_NotInit if the buffer is not initialized * @return ErrorCode_Success on success */ [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; /** - * Lets the BufferReader points to data with given size. + * Sets the underlying buffer for this reader. 
* @param data - * @param size + * @param data_size **/ - void reset_buffer (const char* data, size_t size) { + void set_buffer (const char* data, size_t data_size) { m_data = data; - m_size = size; + m_data_size = data_size; m_cursor_pos = 0; } - private: const char* m_data; - size_t m_size; + size_t m_data_size; size_t m_cursor_pos; }; - #endif // BufferReader_HPP diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 78f0ac5a4..aa7d6a558 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -274,14 +274,15 @@ ErrorCode BufferedFileReader::set_buffer_size (size_t buffer_size) { SPDLOG_ERROR("Buffer size {} is not a multiple of page size", buffer_size); return ErrorCode_BadParam; } - // Calculate the logarithm base 2 of the number - double exponent = log(buffer_size) / log(2); - if (ceil(exponent) != floor(exponent)) { + // fast calculation to check if buffer_size is a power of 2 leveraged + // from https://stackoverflow.com/questions/51094594/ + // how-to-check-if-exactly-one-bit-is-set-in-an-int + if (false == (!(buffer_size & (buffer_size-1)))) { SPDLOG_ERROR("Buffer size {} is not a power of 2", buffer_size); return ErrorCode_BadParam; } - m_buffer_exp = static_cast(exponent); + m_buffer_exp = static_cast(log2(static_cast(buffer_size))); m_buffer_size = buffer_size; m_buffer_aligned_mask = ~(m_buffer_size - 1); return ErrorCode_Success; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 691ace381..358ae2b9d 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -14,10 +14,11 @@ #include "ReaderInterface.hpp" #include "TraceableException.hpp" - +/** + * Class that represents a on-disk-file with customized buffering + */ class BufferedFileReader : public ReaderInterface { public: - static constexpr size_t cDefaultBufferSize = 65536; // Types class 
OperationFailed : public TraceableException { public: @@ -34,6 +35,7 @@ class BufferedFileReader : public ReaderInterface { // Constructors BufferedFileReader(); ~BufferedFileReader(); + // Methods implementing the ReaderInterface /** * Tries to get the current position of the read head in the file @@ -191,7 +193,10 @@ class BufferedFileReader : public ReaderInterface { */ [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); - // Types + // Constants + static constexpr size_t cDefaultBufferSize = 65536; + + // Variables size_t m_file_pos; int m_fd; std::string m_path; @@ -201,7 +206,7 @@ class BufferedFileReader : public ReaderInterface { size_t m_data_size; size_t m_buffer_begin_pos; - // constant for buffer related calculation + // Values for buffer related calculation size_t m_buffer_exp; size_t m_buffer_size; size_t m_buffer_aligned_mask; diff --git a/components/core/src/LibarchiveFileReader.cpp b/components/core/src/LibarchiveFileReader.cpp index 92c691877..7158f15d5 100644 --- a/components/core/src/LibarchiveFileReader.cpp +++ b/components/core/src/LibarchiveFileReader.cpp @@ -266,4 +266,4 @@ ErrorCode LibarchiveFileReader::read_next_data_block () { m_pos_in_data_block = 0; return ErrorCode_Success; -} \ No newline at end of file +} diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 83bff5184..34ba7716a 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -3,6 +3,7 @@ // Project headers #include "byteswap.hpp" #include "protocol_constants.hpp" +#include "../../type_utils.hpp" using std::is_same_v; using std::string; @@ -20,91 +21,91 @@ namespace ffi::ir_stream { static bool is_variable_tag (encoded_tag_t tag, bool& is_encoded_var); /** - * Decodes an integer from ir_buf + * Decodes an integer from reader * @tparam integer_t Type of the integer to decode - * 
@param ir_buf + * @param reader * @param value Returns the decoded integer - * @return true on success, false if the ir_buf doesn't contain enough data + * @return true on success, false if the reader doesn't contain enough data * to decode */ template - static bool decode_int (ReaderInterface& ir_buf, integer_t& value); + static bool decode_int (ReaderInterface& reader, integer_t& value); /** - * Decodes the next logtype string from ir_buf - * @param ir_buf + * Decodes the next logtype string from reader + * @param reader * @param encoded_tag * @param logtype Returns the logtype string * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data * to decode */ - static IRErrorCode parse_logtype (ReaderInterface& ir_buf, encoded_tag_t encoded_tag, + static IRErrorCode parse_logtype (ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype); /** - * Decodes the next dictionary-type variable string from ir_buf - * @param ir_buf + * Decodes the next dictionary-type variable string from reader + * @param reader * @param encoded_tag * @param dict_var Returns the dictionary variable * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough * data to decode */ - static IRErrorCode parse_dictionary_var (ReaderInterface& ir_buf, + static IRErrorCode parse_dictionary_var (ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var); /** - * Parses the next timestamp from ir_buf + * Parses the next timestamp from reader * @tparam encoded_variable_t Type of the encoded variable - * @param ir_buf + * @param 
reader * @param encoded_tag * @param ts Returns the timestamp delta if * encoded_variable_t == four_byte_encoded_variable_t or the actual * timestamp if encoded_variable_t == eight_byte_encoded_variable_t * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data * to decode */ template - IRErrorCode parse_timestamp (ReaderInterface& ir_buf, encoded_tag_t encoded_tag, + IRErrorCode parse_timestamp (ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); /** - * Decodes the next encoded message from ir_buf + * Decodes the next encoded message from reader * @tparam encoded_variable_t Type of the encoded variable - * @param ir_buf + * @param reader * @param message Returns the decoded message * @param timestamp Returns the timestamp delta if * encoded_variable_t == four_byte_encoded_variable_t or the actual * timestamp if encoded_variable_t == eight_byte_encoded_variable_t * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR * @return IRErrorCode_Decode_Error if the encoded message cannot be * properly decoded - * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data * to decode */ template - static IRErrorCode generic_decode_next_message (ReaderInterface& ir_buf, + static IRErrorCode generic_decode_next_message (ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp); /** - * Reads metadata information from the ir_buf - * @param ir_buf + * Reads metadata information from the reader + * @param reader * @param metadata_type Returns the type of the metadata found 
in the IR - * @param metadata_pos Returns the starting position of the metadata in ir_buf + * @param metadata_pos Returns the starting position of the metadata in reader * @param metadata_size Returns the size of the metadata written in the IR * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough data + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data * to decode */ - static IRErrorCode read_metadata_info (ReaderInterface& ir_buf, + static IRErrorCode read_metadata_info (ReaderInterface& reader, encoded_tag_t& metadata_type, uint16_t& metadata_size); /** @@ -153,45 +154,45 @@ namespace ffi::ir_stream { } template - static bool decode_int (ReaderInterface& ir_buf, integer_t& value) { - integer_t value_small_endian; - if (ir_buf.try_read_numeric_value(value_small_endian) != ErrorCode_Success) { + static bool decode_int (ReaderInterface& reader, integer_t& value) { + integer_t value_little_endian; + if (reader.try_read_numeric_value(value_little_endian) != ErrorCode_Success) { return false; } constexpr auto read_size = sizeof(integer_t); static_assert(read_size == 1 || read_size == 2 || read_size == 4 || read_size == 8); if constexpr (read_size == 1) { - value = value_small_endian; + value = value_little_endian; } else if constexpr (read_size == 2) { - value = bswap_16(value_small_endian); + value = bswap_16(value_little_endian); } else if constexpr (read_size == 4) { - value = bswap_32(value_small_endian); + value = bswap_32(value_little_endian); } else if constexpr (read_size == 8) { - value = bswap_64(value_small_endian); + value = bswap_64(value_little_endian); } return true; } - static IRErrorCode parse_logtype (ReaderInterface& ir_buf, encoded_tag_t encoded_tag, + static IRErrorCode parse_logtype (ReaderInterface& reader, encoded_tag_t encoded_tag, 
string& logtype) { size_t logtype_length; if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { uint8_t length; - if (false == decode_int(ir_buf, length)) { + if (false == decode_int(reader, length)) { return IRErrorCode_Incomplete_IR; } logtype_length = length; } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenUShort) { uint16_t length; - if (false == decode_int(ir_buf, length)) { + if (false == decode_int(reader, length)) { return IRErrorCode_Incomplete_IR; } logtype_length = length; } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenInt) { int32_t length; - if (false == decode_int(ir_buf, length)) { + if (false == decode_int(reader, length)) { return IRErrorCode_Incomplete_IR; } logtype_length = length; @@ -199,31 +200,31 @@ namespace ffi::ir_stream { return IRErrorCode_Corrupted_IR; } - if (ErrorCode_Success != ir_buf.try_read_string(logtype_length, logtype)) { + if (ErrorCode_Success != reader.try_read_string(logtype_length, logtype)) { return IRErrorCode_Incomplete_IR; } return IRErrorCode_Success; } - static IRErrorCode parse_dictionary_var (ReaderInterface& ir_buf, + static IRErrorCode parse_dictionary_var (ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var) { // Decode variable's length size_t var_length; if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { uint8_t length; - if (false == decode_int(ir_buf, length)) { + if (false == decode_int(reader, length)) { return IRErrorCode_Incomplete_IR; } var_length = length; } else if (cProtocol::Payload::VarStrLenUShort == encoded_tag) { uint16_t length; - if (false == decode_int(ir_buf, length)) { + if (false == decode_int(reader, length)) { return IRErrorCode_Incomplete_IR; } var_length = length; } else if (cProtocol::Payload::VarStrLenInt == encoded_tag) { int32_t length; - if (false == decode_int(ir_buf, length)) { + if (false == decode_int(reader, length)) { return IRErrorCode_Incomplete_IR; } var_length = length; @@ -232,7 +233,7 @@ namespace ffi::ir_stream { } 
// Read the dictionary variable - if (ErrorCode_Success != ir_buf.try_read_string(var_length, dict_var)) { + if (ErrorCode_Success != reader.try_read_string(var_length, dict_var)) { return IRErrorCode_Incomplete_IR; } @@ -240,7 +241,7 @@ namespace ffi::ir_stream { } template - IRErrorCode parse_timestamp (ReaderInterface& ir_buf, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) + IRErrorCode parse_timestamp (ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { static_assert(is_same_v || is_same_v); @@ -249,25 +250,25 @@ namespace ffi::ir_stream { if (cProtocol::Payload::TimestampVal != encoded_tag) { return IRErrorCode_Corrupted_IR; } - if (false == decode_int(ir_buf, ts)) { + if (false == decode_int(reader, ts)) { return IRErrorCode_Incomplete_IR; } } else { if (cProtocol::Payload::TimestampDeltaByte == encoded_tag) { int8_t ts_delta; - if (false == decode_int(ir_buf, ts_delta)) { + if (false == decode_int(reader, ts_delta)) { return IRErrorCode_Incomplete_IR; } ts = ts_delta; } else if (cProtocol::Payload::TimestampDeltaShort == encoded_tag) { int16_t ts_delta; - if (false == decode_int(ir_buf, ts_delta)) { + if (false == decode_int(reader, ts_delta)) { return IRErrorCode_Incomplete_IR; } ts = ts_delta; } else if (cProtocol::Payload::TimestampDeltaInt == encoded_tag) { int32_t ts_delta; - if (false == decode_int(ir_buf, ts_delta)) { + if (false == decode_int(reader, ts_delta)) { return IRErrorCode_Incomplete_IR; } ts = ts_delta; @@ -279,11 +280,11 @@ namespace ffi::ir_stream { } template - static IRErrorCode generic_decode_next_message (ReaderInterface& ir_buf, string& message, + static IRErrorCode generic_decode_next_message (ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) { encoded_tag_t encoded_tag; - if (ErrorCode_Success != ir_buf.try_read_numeric_value(encoded_tag)) { + if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } if (cProtocol::Eof == encoded_tag) { @@ 
-298,26 +299,26 @@ namespace ffi::ir_stream { bool is_encoded_var; while (is_variable_tag(encoded_tag, is_encoded_var)) { if (is_encoded_var) { - if (false == decode_int(ir_buf, encoded_variable)) { + if (false == decode_int(reader, encoded_variable)) { return IRErrorCode_Incomplete_IR; } encoded_vars.push_back(encoded_variable); } else { - if (auto error_code = parse_dictionary_var(ir_buf, encoded_tag, var_str); + if (auto error_code = parse_dictionary_var(reader, encoded_tag, var_str); IRErrorCode_Success != error_code) { return error_code; } dict_vars.emplace_back(var_str); } - if (ErrorCode_Success != ir_buf.try_read_numeric_value(encoded_tag)) { + if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } } // Handle logtype string logtype; - if (auto error_code = parse_logtype(ir_buf, encoded_tag, logtype); + if (auto error_code = parse_logtype(reader, encoded_tag, logtype); IRErrorCode_Success != error_code) { return error_code; @@ -326,10 +327,10 @@ namespace ffi::ir_stream { // NOTE: for the eight-byte encoding, the timestamp is the actual // timestamp; for the four-byte encoding, the timestamp is a timestamp // delta - if (ErrorCode_Success != ir_buf.try_read_numeric_value(encoded_tag)) { + if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } - if (auto error_code = parse_timestamp(ir_buf, encoded_tag, timestamp); + if (auto error_code = parse_timestamp(reader, encoded_tag, timestamp); IRErrorCode_Success != error_code) { return error_code; } @@ -342,28 +343,28 @@ namespace ffi::ir_stream { return IRErrorCode_Success; } - static IRErrorCode read_metadata_info (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, + static IRErrorCode read_metadata_info (ReaderInterface& reader, encoded_tag_t& metadata_type, uint16_t& metadata_size) { - if (ErrorCode_Success != ir_buf.try_read_numeric_value(metadata_type)) { + if (ErrorCode_Success != 
reader.try_read_numeric_value(metadata_type)) { return IRErrorCode_Incomplete_IR; } // Read metadata length encoded_tag_t encoded_tag; - if (ErrorCode_Success != ir_buf.try_read_numeric_value(encoded_tag)) { + if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } switch (encoded_tag) { case cProtocol::Metadata::LengthUByte: uint8_t ubyte_res; - if (false == decode_int(ir_buf, ubyte_res)) { + if (false == decode_int(reader, ubyte_res)) { return IRErrorCode_Incomplete_IR; } metadata_size = ubyte_res; break; case cProtocol::Metadata::LengthUShort: uint16_t ushort_res; - if (false == decode_int(ir_buf, ushort_res)) { + if (false == decode_int(reader, ushort_res)) { return IRErrorCode_Incomplete_IR; } metadata_size = ushort_res; @@ -387,7 +388,7 @@ namespace ffi::ir_stream { size_t dictionary_vars_ix = 0; size_t encoded_vars_ix = 0; - for (size_t cur_pos = 0; cur_pos < logtype.size(); ++cur_pos) { + for (size_t cur_pos = 0; cur_pos < logtype.length(); ++cur_pos) { auto c = logtype[cur_pos]; switch(c) { case enum_to_underlying_type(VariablePlaceholder::Float): { @@ -435,7 +436,7 @@ namespace ffi::ir_stream { case cVariablePlaceholderEscapeCharacter: { // Ensure the escape character is followed by a // character that's being escaped - if (cur_pos == logtype.size() - 1) { + if (cur_pos == logtype.length() - 1) { throw EncodingException(ErrorCode_Corrupt, __FILENAME__, __LINE__, cUnexpectedEscapeCharacterMessage); } @@ -456,19 +457,18 @@ namespace ffi::ir_stream { } } // Add remainder - if (next_static_text_begin_pos < logtype.size()) { + if (next_static_text_begin_pos < logtype.length()) { message.append(logtype, next_static_text_begin_pos, - logtype.size() - next_static_text_begin_pos); + logtype.length() - next_static_text_begin_pos); } return message; } - IRErrorCode get_encoding_type (ReaderInterface& ir_buf, bool& is_four_bytes_encoding) { + IRErrorCode get_encoding_type (ReaderInterface& reader, bool& 
is_four_bytes_encoding) { char buffer[cProtocol::MagicNumberLength]; - size_t num_bytes_read; - auto error_code = ir_buf.try_read(buffer, cProtocol::MagicNumberLength, num_bytes_read); - if (error_code != ErrorCode_Success || num_bytes_read != cProtocol::MagicNumberLength) { + auto error_code = reader.try_read_exact_length(buffer, cProtocol::MagicNumberLength); + if (error_code != ErrorCode_Success) { return IRErrorCode_Incomplete_IR; } if (0 == memcmp(buffer, cProtocol::FourByteEncodingMagicNumber, @@ -483,54 +483,54 @@ namespace ffi::ir_stream { return IRErrorCode_Success; } - IRErrorCode decode_preamble (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, + IRErrorCode decode_preamble (ReaderInterface& reader, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size) { - if (auto error_code = read_metadata_info(ir_buf, metadata_type, metadata_size); + if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); error_code != IRErrorCode_Success) { return error_code; } - metadata_pos = ir_buf.get_pos(); + metadata_pos = reader.get_pos(); //TODO: this might not be optimal - if (ErrorCode_Success != ir_buf.try_seek_from_begin(metadata_pos + metadata_size)) { + if (ErrorCode_Success != reader.try_seek_from_begin(metadata_pos + metadata_size)) { return IRErrorCode_Incomplete_IR; } return IRErrorCode_Success; } - IRErrorCode decode_preamble (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, - std::vector& metadata) + IRErrorCode decode_preamble (ReaderInterface& reader, encoded_tag_t& metadata_type, + std::vector& metadata) { uint16_t metadata_size; - if (auto error_code = read_metadata_info(ir_buf, metadata_type, metadata_size); + if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); error_code != IRErrorCode_Success) { return error_code; } metadata.resize(metadata_size); - if (ErrorCode_Success != ir_buf.try_read_exact_length( - reinterpret_cast(metadata.data()), metadata_size)) { + if 
(ErrorCode_Success != reader.try_read_exact_length( + size_checked_pointer_cast(metadata.data()), metadata_size)) { return IRErrorCode_Incomplete_IR; } return IRErrorCode_Success; } namespace four_byte_encoding { - IRErrorCode decode_next_message (ReaderInterface& ir_buf, string& message, + IRErrorCode decode_next_message (ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp_delta) { return generic_decode_next_message( - ir_buf, message, timestamp_delta + reader, message, timestamp_delta ); } } namespace eight_byte_encoding { - IRErrorCode decode_next_message (ReaderInterface& ir_buf, string& message, + IRErrorCode decode_next_message (ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) { return generic_decode_next_message( - ir_buf, message, timestamp + reader, message, timestamp ); } } diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 0f8411e12..318e91e97 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -6,11 +6,12 @@ #include // Project headers -#include "../encoding_methods.hpp" #include "../../ReaderInterface.hpp" +#include "../encoding_methods.hpp" + namespace ffi::ir_stream { + using encoded_tag_t = int8_t; - using encoded_tag_t = uint8_t; typedef enum { IRErrorCode_Success, IRErrorCode_Decode_Error, @@ -55,7 +56,7 @@ namespace ffi::ir_stream { * data to decode */ IRErrorCode decode_preamble (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, - std::vector& metadata); + std::vector& metadata); namespace eight_byte_encoding { /** diff --git a/components/core/src/ffi/ir_stream/encoding_methods.hpp b/components/core/src/ffi/ir_stream/encoding_methods.hpp index d022c9e51..d1052d07b 100644 --- a/components/core/src/ffi/ir_stream/encoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/encoding_methods.hpp @@ -15,23 +15,23 @@ namespace ffi::ir_stream { * @param 
timestamp_pattern * @param timestamp_pattern_syntax * @param time_zone_id - * @param ir_buf + * @param reader * @return true on success, false otherwise */ bool encode_preamble (std::string_view timestamp_pattern, std::string_view timestamp_pattern_syntax, - std::string_view time_zone_id, std::vector& ir_buf); + std::string_view time_zone_id, std::vector& reader); /** * Encodes the given message into the eight-byte encoding IR stream * @param timestamp * @param message * @param logtype - * @param ir_buf + * @param reader * @return true on success, false otherwise */ bool encode_message (epoch_time_ms_t timestamp, std::string_view message, - std::string& logtype, std::vector& ir_buf); + std::string& logtype, std::vector& reader); } namespace four_byte_encoding { @@ -41,44 +41,44 @@ namespace ffi::ir_stream { * @param timestamp_pattern_syntax * @param time_zone_id * @param reference_timestamp - * @param ir_buf + * @param reader * @return true on success, false otherwise */ bool encode_preamble (std::string_view timestamp_pattern, std::string_view timestamp_pattern_syntax, std::string_view time_zone_id, epoch_time_ms_t reference_timestamp, - std::vector& ir_buf); + std::vector& reader); /** * Encodes the given message into the four-byte encoding IR stream * @param timestamp_delta * @param message * @param logtype - * @param ir_buf + * @param reader * @return true on success, false otherwise */ bool encode_message (epoch_time_ms_t timestamp_delta, std::string_view message, - std::string& logtype, std::vector& ir_buf); + std::string& logtype, std::vector& reader); /** * Encodes the given message into the four-byte encoding IR stream * without encoding timestamp delta * @param message * @param logtype - * @param ir_buf + * @param reader * @return true on success, false otherwise */ bool encode_message (std::string_view message, std::string& logtype, - std::vector& ir_buf); + std::vector& reader); /** * Encodes the given timestamp delta into the four-byte encoding IR * 
stream * @param timestamp_delta - * @param ir_buf + * @param reader * @return true on success, false otherwise */ - bool encode_timestamp (epoch_time_ms_t timestamp_delta, std::vector& ir_buf); + bool encode_timestamp (epoch_time_ms_t timestamp_delta, std::vector& reader); } } diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index e4668530f..b875bfb5a 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -192,4 +192,4 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { delete[] test_data; delete[] read_buffer; -} \ No newline at end of file +} diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 44e366d25..4b863a5ad 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -212,8 +212,8 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { EightByteEncodingMagicNumber + MagicNumberLength}; // Test eight-byte encoding - ir_buffer.reset_buffer(reinterpret_cast(eight_byte_encoding_vec.data()), - eight_byte_encoding_vec.size()); + ir_buffer.set_buffer(reinterpret_cast(eight_byte_encoding_vec.data()), + eight_byte_encoding_vec.size()); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); @@ -222,28 +222,28 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { vector four_byte_encoding_vec{FourByteEncodingMagicNumber, FourByteEncodingMagicNumber + MagicNumberLength}; - ir_buffer.reset_buffer(reinterpret_cast(four_byte_encoding_vec.data()), - four_byte_encoding_vec.size()); + ir_buffer.set_buffer(reinterpret_cast(four_byte_encoding_vec.data()), + four_byte_encoding_vec.size()); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); 
REQUIRE(match_encoding_type(is_four_bytes_encoding)); // Test error on empty and incomplete ir_buffer const vector empty_ir_vec; - ir_buffer.reset_buffer(reinterpret_cast(empty_ir_vec.data()), - empty_ir_vec.size()); + ir_buffer.set_buffer(reinterpret_cast(empty_ir_vec.data()), + empty_ir_vec.size()); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); - ir_buffer.reset_buffer(reinterpret_cast(four_byte_encoding_vec.data()), - four_byte_encoding_vec.size() - 1); + ir_buffer.set_buffer(reinterpret_cast(four_byte_encoding_vec.data()), + four_byte_encoding_vec.size() - 1); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); // Test error on invalid encoding const vector invalid_ir_vec{0x02, 0x43, 0x24, 0x34}; - ir_buffer.reset_buffer(reinterpret_cast(invalid_ir_vec.data()), - invalid_ir_vec.size()); + ir_buffer.set_buffer(reinterpret_cast(invalid_ir_vec.data()), + invalid_ir_vec.size()); REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Corrupted_IR); @@ -263,7 +263,7 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode const size_t encoded_preamble_end_pos = ir_buf.size(); // Check if encoding type is properly read - ir_buffer.reset_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); + ir_buffer.set_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); bool is_four_bytes_encoding; REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); @@ -283,12 +283,12 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode string_view json_metadata_ref {json_metadata_ptr, metadata_size}; // Test if preamble can be decoded by the string copy method - std::vector json_metadata_vec; + std::vector json_metadata_vec; ir_buffer.seek_from_begin(MagicNumberLength); REQUIRE(decode_preamble(ir_buffer, metadata_type, json_metadata_vec) == 
IRErrorCode::IRErrorCode_Success); - string_view json_metadata_copied {reinterpret_cast(json_metadata_vec.data()), - json_metadata_vec.size()}; + string_view json_metadata_copied { reinterpret_cast(json_metadata_vec.data()), + json_metadata_vec.size() }; REQUIRE (json_metadata_copied == json_metadata_ref); auto metadata_json = nlohmann::json::parse(json_metadata_ref); @@ -343,7 +343,7 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", const size_t encoded_message_start_pos = 0; // Test if message can be decoded properly - ir_buffer.reset_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); + ir_buffer.set_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); string decoded_message; epoch_time_ms_t timestamp; @@ -360,7 +360,7 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", // Test incomplete IR ir_buf.resize(encoded_message_end_pos - 4); - ir_buffer.reset_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); + ir_buffer.set_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); REQUIRE(IRErrorCode::IRErrorCode_Incomplete_IR == decode_next_message(ir_buffer, message, timestamp)); } @@ -393,8 +393,8 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") // Test if a trailing escape triggers a decoder error auto ir_with_extra_escape {ir_buf}; ir_with_extra_escape.at(logtype_end_pos - 1) = ffi::cVariablePlaceholderEscapeCharacter; - ir_buffer.reset_buffer(reinterpret_cast(ir_with_extra_escape.data()), - ir_with_extra_escape.size()); + ir_buffer.set_buffer(reinterpret_cast(ir_with_extra_escape.data()), + ir_with_extra_escape.size()); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == decode_next_message(ir_buffer, decoded_message, timestamp)); @@ -403,8 +403,8 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") auto ir_with_extra_placeholder{ir_buf}; ir_with_extra_placeholder.at(logtype_end_pos - 1) = enum_to_underlying_type(VariablePlaceholder::Dictionary); - 
ir_buffer.reset_buffer(reinterpret_cast(ir_with_extra_placeholder.data()), - ir_with_extra_placeholder.size()); + ir_buffer.set_buffer(reinterpret_cast(ir_with_extra_placeholder.data()), + ir_with_extra_placeholder.size()); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == decode_next_message(ir_buffer, decoded_message, timestamp)); From edc2acd456e7a23fd02114619745f0489bef619b Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 31 May 2023 13:26:20 -0400 Subject: [PATCH 039/121] Missing fixes --- components/core/src/BufferReader.hpp | 4 ++-- components/core/src/BufferedFileReader.hpp | 2 +- components/core/src/ffi/ir_stream/decoding_methods.cpp | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 9e4546007..9f380c813 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -5,7 +5,7 @@ #include "ReaderInterface.hpp" /** - * Class that represents a ReaderInterface to a fixed size buffer + * Class for reading from a fixed size in memory buffer */ class BufferReader : public ReaderInterface { public: @@ -43,7 +43,7 @@ class BufferReader : public ReaderInterface { * Tries to seek from the beginning of the buffer to the given position * @param pos * @return ErrorCode_NotInit if the buffer is not initialized - * @return ErrorCode_OutOfBounds if the given position >= the buffer's size + * @return ErrorCode_OutOfBounds if the given position > the buffer's size * @return ErrorCode_Success on success */ [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 358ae2b9d..f1293be88 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -15,7 +15,7 @@ #include "TraceableException.hpp" /** - * Class that represents a on-disk-file 
with customized buffering + * Class for reading from a on-disk file with custom buffering */ class BufferedFileReader : public ReaderInterface { public: diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 34ba7716a..ccb90ce09 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -54,8 +54,8 @@ namespace ffi::ir_stream { * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough * data to decode */ - static IRErrorCode parse_dictionary_var (ReaderInterface& reader, - encoded_tag_t encoded_tag, string& dict_var); + static IRErrorCode parse_dictionary_var (ReaderInterface& reader, encoded_tag_t encoded_tag, + string& dict_var); /** * Parses the next timestamp from reader @@ -491,7 +491,6 @@ namespace ffi::ir_stream { return error_code; } metadata_pos = reader.get_pos(); - //TODO: this might not be optimal if (ErrorCode_Success != reader.try_seek_from_begin(metadata_pos + metadata_size)) { return IRErrorCode_Incomplete_IR; } From e32d95e3167ca8f01f4372f9909fe4ebf4841510 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 31 May 2023 14:14:47 -0400 Subject: [PATCH 040/121] Remove redundant constructor --- components/core/src/BufferReader.cpp | 18 ++-- components/core/src/BufferReader.hpp | 3 +- .../core/tests/test-ir_encoding_methods.cpp | 83 ++++++++++--------- 3 files changed, 54 insertions(+), 50 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index f0e7af9fb..78243404f 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -4,12 +4,16 @@ #include #include -using std::string_view; +BufferReader::BufferReader (const char* data, size_t data_size) { + if (data == nullptr || data_size == 0) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + 
m_data = data; + m_data_size = data_size; + m_cursor_pos = 0; +} ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - if (nullptr == m_data) { - return ErrorCode_NotInit; - } if (nullptr == buf && num_bytes_to_read > 0) { return ErrorCode_BadParam; } @@ -28,9 +32,6 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n } [[nodiscard]] ErrorCode BufferReader::try_seek_from_begin (size_t pos) { - if (nullptr == m_data) { - return ErrorCode_NotInit; - } if (pos > m_data_size) { return ErrorCode_OutOfBounds; } @@ -39,9 +40,6 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n } [[nodiscard]] ErrorCode BufferReader::try_get_pos (size_t& pos) { - if (nullptr == m_data) { - return ErrorCode_NotInit; - } pos = m_cursor_pos; return ErrorCode_Success; } diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 9f380c813..49908fbfc 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -23,8 +23,7 @@ class BufferReader : public ReaderInterface { }; // Constructors - BufferReader () : m_data(nullptr), m_data_size(0), m_cursor_pos(0) {} - BufferReader (const char* data, size_t size) : m_data(data), m_data_size(size), m_cursor_pos(0) {} + BufferReader (const char* data, size_t data_size); // Methods implementing the ReaderInterface /** diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 4b863a5ad..4d91afe6d 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -206,15 +206,16 @@ static void set_timestamp_info (const nlohmann::json& metadata_json, TimestampIn TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { bool is_four_bytes_encoding; - BufferReader ir_buffer; // Test eight-byte encoding vector eight_byte_encoding_vec{EightByteEncodingMagicNumber, 
EightByteEncodingMagicNumber + MagicNumberLength}; // Test eight-byte encoding - ir_buffer.set_buffer(reinterpret_cast(eight_byte_encoding_vec.data()), - eight_byte_encoding_vec.size()); - REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == + BufferReader eight_byte_encoding_buffer ( + reinterpret_cast(eight_byte_encoding_vec.data()), + eight_byte_encoding_vec.size() + ); + REQUIRE(get_encoding_type(eight_byte_encoding_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); @@ -222,29 +223,30 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { vector four_byte_encoding_vec{FourByteEncodingMagicNumber, FourByteEncodingMagicNumber + MagicNumberLength}; - ir_buffer.set_buffer(reinterpret_cast(four_byte_encoding_vec.data()), - four_byte_encoding_vec.size()); - REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == + BufferReader four_byte_encoding_buffer ( + reinterpret_cast(four_byte_encoding_vec.data()), + four_byte_encoding_vec.size() + ); + REQUIRE(get_encoding_type(four_byte_encoding_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); - // Test error on empty and incomplete ir_buffer - const vector empty_ir_vec; - ir_buffer.set_buffer(reinterpret_cast(empty_ir_vec.data()), - empty_ir_vec.size()); - REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == - IRErrorCode::IRErrorCode_Incomplete_IR); + // Test error on incomplete ir_buffer + BufferReader incomplete_buffer ( + reinterpret_cast(four_byte_encoding_vec.data()), + four_byte_encoding_vec.size() - 1 + ); - ir_buffer.set_buffer(reinterpret_cast(four_byte_encoding_vec.data()), - four_byte_encoding_vec.size() - 1); - REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == + REQUIRE(get_encoding_type(incomplete_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); // Test error on invalid encoding const 
vector invalid_ir_vec{0x02, 0x43, 0x24, 0x34}; - ir_buffer.set_buffer(reinterpret_cast(invalid_ir_vec.data()), - invalid_ir_vec.size()); - REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == + BufferReader invalid_ir_buffer ( + reinterpret_cast(invalid_ir_vec.data()), + invalid_ir_vec.size() + ); + REQUIRE(get_encoding_type(invalid_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Corrupted_IR); } @@ -253,7 +255,6 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode eight_byte_encoded_variable_t) { vector ir_buf; - BufferReader ir_buffer; constexpr char timestamp_pattern[] = "%Y-%m-%d %H:%M:%S,%3"; constexpr char timestamp_pattern_syntax[] = "yyyy-MM-dd HH:mm:ss"; constexpr char time_zone_id[] = "Asia/Tokyo"; @@ -263,29 +264,31 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode const size_t encoded_preamble_end_pos = ir_buf.size(); // Check if encoding type is properly read - ir_buffer.set_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); + BufferReader encoding_buffer ( + reinterpret_cast(ir_buf.data()), ir_buf.size() + ); bool is_four_bytes_encoding; - REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == + REQUIRE(get_encoding_type(encoding_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); - REQUIRE(MagicNumberLength == ir_buffer.get_pos()); + REQUIRE(MagicNumberLength == encoding_buffer.get_pos()); // Test if preamble can be decoded correctly TimestampInfo ts_info; encoded_tag_t metadata_type{0}; size_t metadata_pos{0}; uint16_t metadata_size{0}; - REQUIRE(decode_preamble(ir_buffer, metadata_type, metadata_pos, metadata_size) == + REQUIRE(decode_preamble(encoding_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Success); - REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); + REQUIRE(encoded_preamble_end_pos == encoding_buffer.get_pos()); auto 
json_metadata_ptr = reinterpret_cast(ir_buf.data() + metadata_pos); string_view json_metadata_ref {json_metadata_ptr, metadata_size}; // Test if preamble can be decoded by the string copy method std::vector json_metadata_vec; - ir_buffer.seek_from_begin(MagicNumberLength); - REQUIRE(decode_preamble(ir_buffer, metadata_type, json_metadata_vec) == + encoding_buffer.seek_from_begin(MagicNumberLength); + REQUIRE(decode_preamble(encoding_buffer, metadata_type, json_metadata_vec) == IRErrorCode::IRErrorCode_Success); string_view json_metadata_copied { reinterpret_cast(json_metadata_vec.data()), json_metadata_vec.size() }; @@ -299,7 +302,7 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode REQUIRE(timestamp_pattern_syntax == ts_info.timestamp_pattern_syntax); REQUIRE(time_zone_id == ts_info.time_zone_id); REQUIRE(timestamp_pattern == ts_info.timestamp_pattern); - REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); + REQUIRE(encoded_preamble_end_pos == encoding_buffer.get_pos()); if constexpr (is_same_v) { REQUIRE(reference_ts == @@ -319,7 +322,6 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode // Test if corrupted IR can be detected ir_buf[MagicNumberLength] = 0x23; - ir_buffer.seek_from_begin(MagicNumberLength); BufferReader corrupted_preamble_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); REQUIRE(decode_preamble( @@ -331,7 +333,6 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", four_byte_encoded_variable_t, eight_byte_encoded_variable_t) { vector ir_buf; - BufferReader ir_buffer; string logtype; string placeholder_as_string{enum_to_underlying_type(VariablePlaceholder::Dictionary)}; @@ -343,7 +344,8 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", const size_t encoded_message_start_pos = 0; // Test if message can be decoded properly - ir_buffer.set_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); + + BufferReader 
ir_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); string decoded_message; epoch_time_ms_t timestamp; @@ -371,7 +373,6 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", TEST_CASE("message_decode_error", "[ffi][decode_next_message]") { vector ir_buf; - BufferReader ir_buffer; string logtype; string placeholder_as_string{enum_to_underlying_type(VariablePlaceholder::Dictionary)}; @@ -393,20 +394,26 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") // Test if a trailing escape triggers a decoder error auto ir_with_extra_escape {ir_buf}; ir_with_extra_escape.at(logtype_end_pos - 1) = ffi::cVariablePlaceholderEscapeCharacter; - ir_buffer.set_buffer(reinterpret_cast(ir_with_extra_escape.data()), - ir_with_extra_escape.size()); + BufferReader ir_with_extra_escape_buffer ( + reinterpret_cast(ir_with_extra_escape.data()), + ir_with_extra_escape.size() + ); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == - decode_next_message(ir_buffer, decoded_message, + decode_next_message(ir_with_extra_escape_buffer, + decoded_message, timestamp)); // Test if an extra placeholder triggers a decoder error auto ir_with_extra_placeholder{ir_buf}; ir_with_extra_placeholder.at(logtype_end_pos - 1) = enum_to_underlying_type(VariablePlaceholder::Dictionary); - ir_buffer.set_buffer(reinterpret_cast(ir_with_extra_placeholder.data()), - ir_with_extra_placeholder.size()); + BufferReader ir_with_extra_placeholder_buffer ( + reinterpret_cast(ir_with_extra_placeholder.data()), + ir_with_extra_placeholder.size() + ); REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == - decode_next_message(ir_buffer, decoded_message, + decode_next_message(ir_with_extra_placeholder_buffer, + decoded_message, timestamp)); } From fa0bb302cf83f25338356b30bf180fe7c838a837 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 1 Jun 2023 17:54:28 -0400 Subject: [PATCH 041/121] Address code review concern --- 
components/core/src/BufferedFileReader.cpp | 47 +++++++++++----------- components/core/src/BufferedFileReader.hpp | 4 +- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index aa7d6a558..438ad93fa 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -24,7 +24,7 @@ static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_ BufferedFileReader::BufferedFileReader () { m_file_pos = 0; m_fd = -1; - m_checkpoint_enabled = false; + m_checkpoint_pos.reset(); if (auto error_code = set_buffer_size(cDefaultBufferSize); ErrorCode_Success != error_code) { SPDLOG_ERROR("Failed to init reader buffer size to be {}", cDefaultBufferSize); @@ -54,13 +54,13 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { } if (pos <= m_file_pos) { - if (false == m_checkpoint_enabled) { + if (false == m_checkpoint_pos.has_value()) { SPDLOG_ERROR("Error: Seek back when checkpoint is not enabled"); return ErrorCode_Failure; } if (pos < m_checkpoint_pos) { SPDLOG_ERROR("Error: trying to seek to {} which is ahead of checkpoint: {}", - pos, m_checkpoint_pos); + pos, m_checkpoint_pos.value()); return ErrorCode_Failure; } } else { @@ -71,7 +71,7 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } // Handle the case where buffer doesn't contain enough data for seek - if (false == m_checkpoint_enabled) { + if (false == m_checkpoint_pos.has_value()) { m_buffer_begin_pos = pos & m_buffer_aligned_mask; auto offset = lseek(m_fd, m_buffer_begin_pos, SEEK_SET); if (offset == -1) { @@ -112,14 +112,14 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; // keep reading until enough data is read or an eof is seen while (true) { - auto avaiable_bytes_for_read = 
std::min(num_bytes_to_read_from_buffer, - remaining_data_size()); - memcpy(buf + num_bytes_read, buffer_head(), avaiable_bytes_for_read); + auto available_bytes_for_read = std::min(num_bytes_to_read_from_buffer, + remaining_data_size()); + memcpy(buf + num_bytes_read, buffer_head(), available_bytes_for_read); - num_bytes_to_read_from_buffer -= avaiable_bytes_for_read; - num_bytes_read += avaiable_bytes_for_read; + num_bytes_to_read_from_buffer -= available_bytes_for_read; + num_bytes_read += available_bytes_for_read; - m_file_pos += avaiable_bytes_for_read; + m_file_pos += available_bytes_for_read; if (num_bytes_to_read_from_buffer == 0) { break; } @@ -152,8 +152,9 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim size_t delim_pos; while (false == found_delim) { // find the pointer pointing to the delimiter - const auto* delim_ptr = reinterpret_cast(memchr(buffer_head(), delim, - remaining_data_size())); + const char* delim_ptr = reinterpret_cast( + memchr(buffer_head(),delim, remaining_data_size()) + ); if (delim_ptr != nullptr) { delim_pos = (delim_ptr - m_buffer.get()) + 1; found_delim = true; @@ -162,7 +163,7 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim } // append to strings size_t str_length = delim_pos - cursor_pos(); - str.append(reinterpret_cast(buffer_head()), str_length); + str.append(buffer_head(), str_length); m_file_pos += str_length; if (false == found_delim) { @@ -197,7 +198,8 @@ void BufferedFileReader::open (const string& path) { ErrorCode error_code = try_open(path); if (ErrorCode_Success != error_code) { if (ErrorCode_FileNotFound == error_code) { - throw "File not found: " + boost::filesystem::weakly_canonical(path).string() + "\n"; + SPDLOG_ERROR("File not found: {}", boost::filesystem::weakly_canonical(path).string()); + throw OperationFailed(error_code, __FILENAME__, __LINE__); } else { throw OperationFailed(error_code, __FILENAME__, __LINE__); } @@ -211,10 +213,10 @@ 
void BufferedFileReader::close () { ::close(m_fd); m_fd = -1; - if (m_checkpoint_enabled) { + if (m_checkpoint_pos.has_value()) { SPDLOG_DEBUG("close file without resetting checkpoint"); m_buffer = make_unique(m_buffer_size); - m_checkpoint_enabled = false; + m_checkpoint_pos.reset(); } } } @@ -232,18 +234,17 @@ ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { } size_t BufferedFileReader::mark_pos() { - if (true == m_checkpoint_enabled) { - SPDLOG_ERROR("I haven't carefully think about whether we should allow this or not"); + if (m_checkpoint_pos.has_value()) { + SPDLOG_ERROR("mark_pos can not be called twice"); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } m_checkpoint_pos = m_file_pos; - m_checkpoint_enabled = true; return m_file_pos; } void BufferedFileReader::reset_checkpoint () { // alternatively, we can keep claiming back the memory - if (false == m_checkpoint_enabled) { + if (false == m_checkpoint_pos.has_value()) { return; } if (m_data_size != m_buffer_size) { @@ -258,7 +259,7 @@ void BufferedFileReader::reset_checkpoint () { m_buffer = std::move(new_buffer); m_buffer_begin_pos += copy_pos; } - m_checkpoint_enabled = false; + m_checkpoint_pos.reset(); } ErrorCode BufferedFileReader::set_buffer_size (size_t buffer_size) { @@ -303,7 +304,7 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha } } peek_size = std::min(size_to_peek, remaining_data_size()); - data_ptr = reinterpret_cast(buffer_head()); + data_ptr = buffer_head(); return ErrorCode_Success; } @@ -327,7 +328,7 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size) { ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, size_t& num_bytes_refilled) { num_bytes_refilled = 0; - if (false == m_checkpoint_enabled) { + if (false == m_checkpoint_pos.has_value()) { // recover from a previous reset if necessary if (m_data_size > refill_size) { m_buffer = make_unique(refill_size); diff --git 
a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index f1293be88..4efb87a99 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -6,6 +6,7 @@ // C++ libraries #include #include +#include #include // Project headers @@ -212,8 +213,7 @@ class BufferedFileReader : public ReaderInterface { size_t m_buffer_aligned_mask; // Variables for checkpoint support - bool m_checkpoint_enabled; - size_t m_checkpoint_pos; + std::optional m_checkpoint_pos; }; From 6e5b013d1a09a569d24572361359a413e44e76dc Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 17 Jul 2023 08:23:22 -0400 Subject: [PATCH 042/121] BufferReader: Remove possibility for invalid internal buffer; Some clean-up. --- components/core/src/BufferReader.cpp | 28 ++++---- components/core/src/BufferReader.hpp | 29 ++------- .../core/tests/test-ir_encoding_methods.cpp | 65 ++++++++++--------- 3 files changed, 53 insertions(+), 69 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 78243404f..f8a4c6413 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -2,15 +2,13 @@ // C++ standard libraries #include -#include BufferReader::BufferReader (const char* data, size_t data_size) { - if (data == nullptr || data_size == 0) { - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + if (nullptr == data || 0 == data_size) { + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - m_data = data; - m_data_size = data_size; - m_cursor_pos = 0; + m_internal_buf = data; + m_internal_buf_size = data_size; } ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { @@ -18,28 +16,28 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n return ErrorCode_BadParam; } - auto remaining_data_size = 
m_data_size - m_cursor_pos; - if (remaining_data_size == 0) { + auto remaining_data_size = m_internal_buf_size - m_internal_buf_pos; + if (0 == remaining_data_size) { return ErrorCode_EndOfFile; } num_bytes_read = std::min(remaining_data_size, num_bytes_to_read); - auto copy_begin = m_data + m_cursor_pos; + auto copy_begin = m_internal_buf + m_internal_buf_pos; auto copy_end = copy_begin + num_bytes_read; std::copy(copy_begin, copy_end, buf); - m_cursor_pos += num_bytes_read; + m_internal_buf_pos += num_bytes_read; return ErrorCode_Success; } -[[nodiscard]] ErrorCode BufferReader::try_seek_from_begin (size_t pos) { - if (pos > m_data_size) { +ErrorCode BufferReader::try_seek_from_begin (size_t pos) { + if (pos > m_internal_buf_size) { return ErrorCode_OutOfBounds; } - m_cursor_pos = pos; + m_internal_buf_pos = pos; return ErrorCode_Success; } -[[nodiscard]] ErrorCode BufferReader::try_get_pos (size_t& pos) { - pos = m_cursor_pos; +ErrorCode BufferReader::try_get_pos (size_t& pos) { + pos = m_internal_buf_pos; return ErrorCode_Success; } diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 49908fbfc..6a9fece5e 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -5,7 +5,7 @@ #include "ReaderInterface.hpp" /** - * Class for reading from a fixed size in memory buffer + * Class for reading from a fixed-size in-memory buffer */ class BufferReader : public ReaderInterface { public: @@ -14,7 +14,7 @@ class BufferReader : public ReaderInterface { public: // Constructors OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : - TraceableException (error_code, filename, line_number) {} + TraceableException (error_code, filename, line_number) {} // Methods [[nodiscard]] const char* what () const noexcept override { @@ -31,7 +31,6 @@ class BufferReader : public ReaderInterface { * @param buf * @param num_bytes_to_read The number of bytes to try and read * @param 
num_bytes_read The actual number of bytes read - * @return ErrorCode_NotInit if the buffer is not initialized * @return ErrorCode_BadParam if buf is invalid * @return ErrorCode_EndOfFile if buffer doesn't contain more data * @return ErrorCode_Success on success @@ -41,34 +40,20 @@ class BufferReader : public ReaderInterface { /** * Tries to seek from the beginning of the buffer to the given position * @param pos - * @return ErrorCode_NotInit if the buffer is not initialized * @return ErrorCode_OutOfBounds if the given position > the buffer's size * @return ErrorCode_Success on success */ [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; /** - * Tries to get the current position of the read head in the buffer * @param pos Returns the position of the read head in the buffer - * @return ErrorCode_NotInit if the buffer is not initialized - * @return ErrorCode_Success on success + * @return ErrorCode_Success */ [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; - /** - * Sets the underlying buffer for this reader. 
- * @param data - * @param data_size - **/ - void set_buffer (const char* data, size_t data_size) { - m_data = data; - m_data_size = data_size; - m_cursor_pos = 0; - } - private: - const char* m_data; - size_t m_data_size; - size_t m_cursor_pos; + const char* m_internal_buf; + size_t m_internal_buf_size; + size_t m_internal_buf_pos{0}; }; -#endif // BufferReader_HPP +#endif // BUFFERREADER_HPP diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 4d91afe6d..ad0b1f8be 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -211,10 +211,10 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { EightByteEncodingMagicNumber + MagicNumberLength}; // Test eight-byte encoding - BufferReader eight_byte_encoding_buffer ( - reinterpret_cast(eight_byte_encoding_vec.data()), + BufferReader eight_byte_encoding_buffer{ + size_checked_pointer_cast(eight_byte_encoding_vec.data()), eight_byte_encoding_vec.size() - ); + }; REQUIRE(get_encoding_type(eight_byte_encoding_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); @@ -223,29 +223,29 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { vector four_byte_encoding_vec{FourByteEncodingMagicNumber, FourByteEncodingMagicNumber + MagicNumberLength}; - BufferReader four_byte_encoding_buffer ( - reinterpret_cast(four_byte_encoding_vec.data()), - four_byte_encoding_vec.size() - ); + BufferReader four_byte_encoding_buffer{ + size_checked_pointer_cast(four_byte_encoding_vec.data()), + four_byte_encoding_vec.size() + }; REQUIRE(get_encoding_type(four_byte_encoding_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); // Test error on incomplete ir_buffer - BufferReader incomplete_buffer ( - reinterpret_cast(four_byte_encoding_vec.data()), + 
BufferReader incomplete_buffer{ + size_checked_pointer_cast(four_byte_encoding_vec.data()), four_byte_encoding_vec.size() - 1 - ); + }; REQUIRE(get_encoding_type(incomplete_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); // Test error on invalid encoding const vector invalid_ir_vec{0x02, 0x43, 0x24, 0x34}; - BufferReader invalid_ir_buffer ( - reinterpret_cast(invalid_ir_vec.data()), + BufferReader invalid_ir_buffer{ + size_checked_pointer_cast(invalid_ir_vec.data()), invalid_ir_vec.size() - ); + }; REQUIRE(get_encoding_type(invalid_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Corrupted_IR); @@ -264,9 +264,9 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode const size_t encoded_preamble_end_pos = ir_buf.size(); // Check if encoding type is properly read - BufferReader encoding_buffer ( - reinterpret_cast(ir_buf.data()), ir_buf.size() - ); + BufferReader encoding_buffer{ + size_checked_pointer_cast(ir_buf.data()), ir_buf.size() + }; bool is_four_bytes_encoding; REQUIRE(get_encoding_type(encoding_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); @@ -313,8 +313,8 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode // Test if incomplete IR can be detected ir_buf.resize(encoded_preamble_end_pos - 1); - BufferReader incomplete_preamble_buffer(reinterpret_cast(ir_buf.data()), - ir_buf.size()); + BufferReader incomplete_preamble_buffer{size_checked_pointer_cast(ir_buf.data()), + ir_buf.size()}; incomplete_preamble_buffer.seek_from_begin(MagicNumberLength); REQUIRE(decode_preamble( incomplete_preamble_buffer, metadata_type, metadata_pos, metadata_size) == @@ -322,8 +322,8 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode // Test if corrupted IR can be detected ir_buf[MagicNumberLength] = 0x23; - BufferReader corrupted_preamble_buffer(reinterpret_cast(ir_buf.data()), - ir_buf.size()); + BufferReader 
corrupted_preamble_buffer{size_checked_pointer_cast(ir_buf.data()), + ir_buf.size()}; REQUIRE(decode_preamble( corrupted_preamble_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Corrupted_IR); @@ -345,7 +345,7 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", // Test if message can be decoded properly - BufferReader ir_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); + BufferReader ir_buffer{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; string decoded_message; epoch_time_ms_t timestamp; @@ -362,9 +362,10 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", // Test incomplete IR ir_buf.resize(encoded_message_end_pos - 4); - ir_buffer.set_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); + BufferReader incomplete_preamble_buffer{size_checked_pointer_cast(ir_buf.data()), + ir_buf.size()}; REQUIRE(IRErrorCode::IRErrorCode_Incomplete_IR == - decode_next_message(ir_buffer, message, timestamp)); + decode_next_message(incomplete_preamble_buffer, message, timestamp)); } // NOTE: This test only tests eight_byte_encoded_variable_t because we trigger @@ -394,10 +395,10 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") // Test if a trailing escape triggers a decoder error auto ir_with_extra_escape {ir_buf}; ir_with_extra_escape.at(logtype_end_pos - 1) = ffi::cVariablePlaceholderEscapeCharacter; - BufferReader ir_with_extra_escape_buffer ( - reinterpret_cast(ir_with_extra_escape.data()), + BufferReader ir_with_extra_escape_buffer{ + size_checked_pointer_cast(ir_with_extra_escape.data()), ir_with_extra_escape.size() - ); + }; REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == decode_next_message(ir_with_extra_escape_buffer, decoded_message, @@ -407,10 +408,10 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") auto ir_with_extra_placeholder{ir_buf}; ir_with_extra_placeholder.at(logtype_end_pos - 1) = 
enum_to_underlying_type(VariablePlaceholder::Dictionary); - BufferReader ir_with_extra_placeholder_buffer ( - reinterpret_cast(ir_with_extra_placeholder.data()), + BufferReader ir_with_extra_placeholder_buffer{ + size_checked_pointer_cast(ir_with_extra_placeholder.data()), ir_with_extra_placeholder.size() - ); + }; REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == decode_next_message(ir_with_extra_placeholder_buffer, decoded_message, @@ -427,7 +428,7 @@ TEST_CASE("decode_next_message_four_byte_negative_delta", "[ffi][decode_next_mes REQUIRE(true == encode_message(reference_delta_ts_negative, message, logtype, ir_buf)); - BufferReader ir_buffer(reinterpret_cast(ir_buf.data()), ir_buf.size()); + BufferReader ir_buffer{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; string decoded_message; epoch_time_ms_t delta_ts; REQUIRE(IRErrorCode::IRErrorCode_Success == @@ -470,8 +471,8 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", reference_messages.push_back(message); reference_timestamps.push_back(ts); - BufferReader complete_encoding_buffer(reinterpret_cast(ir_buf.data()), - ir_buf.size()); + BufferReader complete_encoding_buffer{size_checked_pointer_cast(ir_buf.data()), + ir_buf.size()}; bool is_four_bytes_encoding; REQUIRE(get_encoding_type(complete_encoding_buffer, is_four_bytes_encoding) == From ff520539588a78ea296026d0152d5b0d51b89f0d Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 25 Jul 2023 10:22:22 -0400 Subject: [PATCH 043/121] Initial change that utilizes BufferReader in the code. 
--- components/core/src/BufferReader.cpp | 39 ++++++ components/core/src/BufferReader.hpp | 15 +++ components/core/src/BufferedFileReader.cpp | 142 +++++++++++---------- components/core/src/BufferedFileReader.hpp | 7 +- 4 files changed, 134 insertions(+), 69 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index f8a4c6413..14a3e6ff9 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -3,6 +3,9 @@ // C++ standard libraries #include +#include +#include + BufferReader::BufferReader (const char* data, size_t data_size) { if (nullptr == data || 0 == data_size) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); @@ -37,7 +40,43 @@ ErrorCode BufferReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } +ErrorCode BufferReader::try_seek_from_current (off_t offset) { + if (m_internal_buf_pos + offset > m_internal_buf_size) { + return ErrorCode_OutOfBounds; + } + m_internal_buf_pos += offset; + return ErrorCode_Success; +} + ErrorCode BufferReader::try_get_pos (size_t& pos) { pos = m_internal_buf_pos; return ErrorCode_Success; } + +void BufferReader::peek_buffer (size_t size_to_peek, const char*& data_ptr, size_t& peek_size) { + peek_size = std::min(size_to_peek, m_internal_buf_size - m_internal_buf_pos); + data_ptr = m_internal_buf + m_internal_buf_pos; +} + +ErrorCode BufferReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, + std::string& str, size_t& length) { + // find the pointer pointing to the delimiter + const char* buffer_head = m_internal_buf + m_internal_buf_pos; + const char* delim_ptr = reinterpret_cast( + memchr(buffer_head, delim, m_internal_buf_size - m_internal_buf_pos) + ); + ErrorCode ret_code; + size_t delim_pos; + if (delim_ptr != nullptr) { + delim_pos = (delim_ptr - m_internal_buf) + 1; + ret_code = ErrorCode_Success; + } else { + delim_pos = m_internal_buf_size; + ret_code = ErrorCode_EndOfFile; + } 
+ // append to strings + length = delim_pos - m_internal_buf_pos; + str.append(buffer_head, length); + m_internal_buf_pos = delim_pos; + return ret_code; +} \ No newline at end of file diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 6a9fece5e..ccc877bf5 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -44,12 +44,27 @@ class BufferReader : public ReaderInterface { * @return ErrorCode_Success on success */ [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; + /** + * Tries to seek from the current pos of the buffer by the given amount + * @param pos + * @return ErrorCode_OutOfBounds if the offset exceeds the buffer's size + * @return ErrorCode_Success on success + */ + [[nodiscard]] ErrorCode try_seek_from_current (off_t offset); /** * @param pos Returns the position of the read head in the buffer * @return ErrorCode_Success */ [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; + // Helper functions + [[nodiscard]] size_t get_buffer_size() const { return m_internal_buf_size; } + + void peek_buffer (size_t size_to_peek, const char*& data_ptr, size_t& peek_size); + + ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, + bool append, std::string& str, size_t& length); + private: const char* m_internal_buf; size_t m_internal_buf_size; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 438ad93fa..a32576d44 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -63,12 +63,14 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { pos, m_checkpoint_pos.value()); return ErrorCode_Failure; } + m_buffer_reader->seek_from_begin(pos - m_buffer_begin_pos); } else { - auto buffer_available_data = remaining_data_size(); - auto seek_distance = pos - m_file_pos; - if (seek_distance <= buffer_available_data) { - m_file_pos = pos; - 
return ErrorCode_Success; + off_t seek_distance = pos - m_file_pos; + if (m_buffer_reader.has_value()) { + if (ErrorCode_Success == m_buffer_reader->try_seek_from_current(seek_distance)) { + m_file_pos = pos; + return ErrorCode_Success; + } } // Handle the case where buffer doesn't contain enough data for seek if (false == m_checkpoint_pos.has_value()) { @@ -78,9 +80,10 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_errno; } // invalidate buffered_data - m_data_size = 0; + m_buffer_reader.reset(); } else { - size_t num_bytes_to_refill = pos - (m_buffer_begin_pos + m_data_size); + auto data_size = get_data_size(); + size_t num_bytes_to_refill = pos - (m_buffer_begin_pos + data_size); size_t quantized_refill_size = quantize_to_buffer_size(num_bytes_to_refill); size_t num_bytes_refilled {0}; @@ -92,6 +95,7 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (ErrorCode_Success != error_code) { return error_code; } + m_buffer_reader->seek_from_begin(pos - m_buffer_begin_pos); } } @@ -109,19 +113,21 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, } num_bytes_read = 0; - size_t num_bytes_to_read_from_buffer {num_bytes_to_read}; // keep reading until enough data is read or an eof is seen while (true) { - auto available_bytes_for_read = std::min(num_bytes_to_read_from_buffer, - remaining_data_size()); - memcpy(buf + num_bytes_read, buffer_head(), available_bytes_for_read); - - num_bytes_to_read_from_buffer -= available_bytes_for_read; - num_bytes_read += available_bytes_for_read; + if (m_buffer_reader.has_value()) { + size_t bytes_read {0}; + auto remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; + ErrorCode error_code = m_buffer_reader->try_read(buf + num_bytes_read, remaining_bytes_to_read, bytes_read); + if (ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) { + return error_code; + } + num_bytes_read += bytes_read; + m_file_pos += bytes_read; - 
m_file_pos += available_bytes_for_read; - if (num_bytes_to_read_from_buffer == 0) { - break; + if (num_bytes_read == num_bytes_to_read) { + break; + } } // refill the buffer if more bytes are to be read auto error_code = refill_reader_buffer(m_buffer_size); @@ -131,7 +137,6 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, return error_code; } } - if (num_bytes_read == 0) { return ErrorCode_EndOfFile; } @@ -149,23 +154,14 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim } bool found_delim {false}; - size_t delim_pos; while (false == found_delim) { - // find the pointer pointing to the delimiter - const char* delim_ptr = reinterpret_cast( - memchr(buffer_head(),delim, remaining_data_size()) - ); - if (delim_ptr != nullptr) { - delim_pos = (delim_ptr - m_buffer.get()) + 1; - found_delim = true; - } else { - delim_pos = m_data_size; + if (m_buffer_reader.has_value()) { + size_t length; + if (ErrorCode_Success == m_buffer_reader->try_read_to_delimiter(delim, keep_delimiter, append, str, length)) { + found_delim = true; + } + m_file_pos += length; } - // append to strings - size_t str_length = delim_pos - cursor_pos(); - str.append(buffer_head(), str_length); - - m_file_pos += str_length; if (false == found_delim) { if (auto error_code = refill_reader_buffer(m_buffer_size); ErrorCode_Success != error_code) { @@ -189,8 +185,8 @@ ErrorCode BufferedFileReader::try_open (const string& path) { } m_path = path; m_file_pos = 0; - m_data_size = 0; m_buffer_begin_pos = 0; + m_buffer_reader.reset(); return ErrorCode_Success; } @@ -247,17 +243,23 @@ void BufferedFileReader::reset_checkpoint () { if (false == m_checkpoint_pos.has_value()) { return; } - if (m_data_size != m_buffer_size) { - // allocate new buffer for buffered data that hasn't been seek passed - auto copy_pos = cursor_pos() & m_buffer_aligned_mask; - m_data_size -= copy_pos; - // Use a quantized size for the new buffer size - auto new_buffer_size = 
quantize_to_buffer_size(m_data_size); - - auto new_buffer = make_unique(new_buffer_size); - memcpy(new_buffer.get(), &m_buffer[copy_pos], m_data_size); - m_buffer = std::move(new_buffer); - m_buffer_begin_pos += copy_pos; + if (m_buffer_reader.has_value()) { + auto data_size = m_buffer_reader->get_buffer_size(); + if (data_size != m_buffer_size) { + // allocate new buffer for buffered data that hasn't been seek passed + auto copy_pos = m_buffer_reader->get_pos() & m_buffer_aligned_mask; + auto new_data_size = data_size - copy_pos; + // Use a quantized size for the new buffer size + auto new_buffer_size = quantize_to_buffer_size(new_data_size); + + auto new_buffer = make_unique(new_buffer_size); + memcpy(new_buffer.get(), &m_buffer[copy_pos], new_data_size); + m_buffer = std::move(new_buffer); + + m_buffer_begin_pos += copy_pos; + m_buffer_reader.emplace(m_buffer.get(), new_data_size); + m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); + } } m_checkpoint_pos.reset(); } @@ -295,7 +297,7 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha return ErrorCode_NotInit; } // Refill the buffer if necessary - if (0 == m_data_size) { + if (false == m_buffer_reader.has_value()) { auto error_code = refill_reader_buffer(m_buffer_size); if (ErrorCode_Success != error_code) { data_ptr = nullptr; @@ -303,17 +305,15 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha return error_code; } } - peek_size = std::min(size_to_peek, remaining_data_size()); - data_ptr = buffer_head(); + m_buffer_reader->peek_buffer(size_to_peek, data_ptr, peek_size); return ErrorCode_Success; } -size_t BufferedFileReader::remaining_data_size () const { - if (m_data_size == 0) { +size_t BufferedFileReader::get_data_size () const { + if (false == m_buffer_reader.has_value()) { return 0; } - assert(m_data_size >= cursor_pos()); - return m_data_size - cursor_pos(); + return m_buffer_reader->get_buffer_size(); } size_t 
BufferedFileReader::quantize_to_buffer_size (size_t size) { @@ -329,8 +329,9 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, size_t& num_bytes_refilled) { num_bytes_refilled = 0; if (false == m_checkpoint_pos.has_value()) { + auto data_size = get_data_size(); // recover from a previous reset if necessary - if (m_data_size > refill_size) { + if (data_size > refill_size) { m_buffer = make_unique(refill_size); } auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), @@ -338,21 +339,32 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, if (error_code != ErrorCode_Success) { return error_code; } - m_buffer_begin_pos = m_file_pos & m_buffer_aligned_mask; - m_data_size = num_bytes_refilled; + m_buffer_begin_pos += data_size; + m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); } else { - // Messy way of copying data from old buffer to new buffer - auto new_buffer = make_unique(m_data_size + refill_size); - memcpy(new_buffer.get(), m_buffer.get(), m_data_size); - auto error_code = try_read_into_buffer(m_fd, &new_buffer[m_data_size], refill_size, - num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; + if (m_buffer_reader.has_value()) { + // Messy way of copying data from old buffer to new buffer + auto data_size = m_buffer_reader->get_buffer_size(); + auto new_buffer = make_unique(data_size + refill_size); + memcpy(new_buffer.get(), m_buffer.get(), data_size); + auto error_code = try_read_into_buffer(m_fd, &new_buffer[data_size], refill_size, + num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; + } + m_buffer = std::move(new_buffer); + data_size += num_bytes_refilled; + m_buffer_reader.emplace(m_buffer.get(), data_size); + } else { + auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), refill_size, + num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; + } + m_buffer_reader.emplace(m_buffer.get(), 
num_bytes_refilled); } - m_buffer = std::move(new_buffer); - m_data_size += num_bytes_refilled; - } + m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); return ErrorCode_Success; } diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 4efb87a99..f30d8b90e 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -10,6 +10,7 @@ #include // Project headers +#include "BufferReader.hpp" #include "Defs.h" #include "ErrorCode.hpp" #include "ReaderInterface.hpp" @@ -160,9 +161,7 @@ class BufferedFileReader : public ReaderInterface { private: // Methods - [[nodiscard]] size_t cursor_pos() const { return m_file_pos - m_buffer_begin_pos; } - [[nodiscard]] char* buffer_head() const { return m_buffer.get() + cursor_pos(); } - [[nodiscard]] size_t remaining_data_size() const; + [[nodiscard]] size_t get_data_size() const; /** * Quantize the given size to be the next integer multiple of buffer_size @@ -204,7 +203,7 @@ class BufferedFileReader : public ReaderInterface { // Buffer specific data std::unique_ptr m_buffer; - size_t m_data_size; + std::optional m_buffer_reader; size_t m_buffer_begin_pos; // Values for buffer related calculation From 47f95734e716a7cf88d9acdcdd8f5a8617c908b4 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 25 Jul 2023 11:41:35 -0400 Subject: [PATCH 044/121] Small fix for read_to_delimiter --- components/core/src/BufferReader.cpp | 7 +++++++ components/core/src/BufferedFileReader.cpp | 17 ++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 14a3e6ff9..89364e545 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -60,6 +60,10 @@ void BufferReader::peek_buffer (size_t size_to_peek, const char*& data_ptr, size ErrorCode 
BufferReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str, size_t& length) { + + if (false == append) { + str.clear(); + } // find the pointer pointing to the delimiter const char* buffer_head = m_internal_buf + m_internal_buf_pos; const char* delim_ptr = reinterpret_cast( @@ -76,6 +80,9 @@ ErrorCode BufferReader::try_read_to_delimiter (char delim, bool keep_delimiter, } // append to strings length = delim_pos - m_internal_buf_pos; + if (false == keep_delimiter && delim == m_internal_buf[delim_pos - 1]) { + --length; + } str.append(buffer_head, length); m_internal_buf_pos = delim_pos; return ret_code; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index a32576d44..0e3069ce5 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -149,22 +149,25 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim return ErrorCode_NotInit; } - if (false == append) { - str.clear(); - } - bool found_delim {false}; + size_t read_size {0}; while (false == found_delim) { if (m_buffer_reader.has_value()) { - size_t length; + size_t length {0}; if (ErrorCode_Success == m_buffer_reader->try_read_to_delimiter(delim, keep_delimiter, append, str, length)) { found_delim = true; } m_file_pos += length; + read_size += length; } if (false == found_delim) { - if (auto error_code = refill_reader_buffer(m_buffer_size); - ErrorCode_Success != error_code) { + auto error_code = refill_reader_buffer(m_buffer_size); + if (ErrorCode_EndOfFile == error_code) { + if (read_size == 0) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; + } else if (ErrorCode_Success != error_code) { return error_code; } } From 16637d1911418e6e5ba64854ccdd7f73d528f3a2 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 25 Jul 2023 12:30:08 -0400 Subject: [PATCH 045/121] simplification --- 
components/core/src/BufferReader.cpp | 8 -- components/core/src/BufferReader.hpp | 7 -- components/core/src/BufferedFileReader.cpp | 116 ++++++++++++--------- 3 files changed, 65 insertions(+), 66 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 89364e545..d3c0fec09 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -40,14 +40,6 @@ ErrorCode BufferReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } -ErrorCode BufferReader::try_seek_from_current (off_t offset) { - if (m_internal_buf_pos + offset > m_internal_buf_size) { - return ErrorCode_OutOfBounds; - } - m_internal_buf_pos += offset; - return ErrorCode_Success; -} - ErrorCode BufferReader::try_get_pos (size_t& pos) { pos = m_internal_buf_pos; return ErrorCode_Success; diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index ccc877bf5..66004b0f3 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -44,13 +44,6 @@ class BufferReader : public ReaderInterface { * @return ErrorCode_Success on success */ [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; - /** - * Tries to seek from the current pos of the buffer by the given amount - * @param pos - * @return ErrorCode_OutOfBounds if the offset exceeds the buffer's size - * @return ErrorCode_Success on success - */ - [[nodiscard]] ErrorCode try_seek_from_current (off_t offset); /** * @param pos Returns the position of the read head in the buffer * @return ErrorCode_Success diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 0e3069ce5..88f9466f8 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -63,23 +63,23 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { pos, m_checkpoint_pos.value()); return ErrorCode_Failure; } + // 
adjust the buffer reader pos m_buffer_reader->seek_from_begin(pos - m_buffer_begin_pos); } else { - off_t seek_distance = pos - m_file_pos; if (m_buffer_reader.has_value()) { - if (ErrorCode_Success == m_buffer_reader->try_seek_from_current(seek_distance)) { + if (ErrorCode_Success == m_buffer_reader->try_seek_from_begin(pos - m_buffer_begin_pos)) { m_file_pos = pos; return ErrorCode_Success; } } - // Handle the case where buffer doesn't contain enough data for seek + // Handle the case where buffer is empty or doesn't contain enough data for seek if (false == m_checkpoint_pos.has_value()) { + // if checkpoint is not set, simply move the file_pos and invalidate the buffer reader m_buffer_begin_pos = pos & m_buffer_aligned_mask; auto offset = lseek(m_fd, m_buffer_begin_pos, SEEK_SET); if (offset == -1) { return ErrorCode_errno; } - // invalidate buffered_data m_buffer_reader.reset(); } else { auto data_size = get_data_size(); @@ -113,24 +113,29 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, } num_bytes_read = 0; + if (false == m_buffer_reader.has_value()) { + // refill the buffer if not initialized + auto error_code = refill_reader_buffer(m_buffer_size); + if (ErrorCode_Success != error_code) { + return error_code; + } + } // keep reading until enough data is read or an eof is seen while (true) { - if (m_buffer_reader.has_value()) { - size_t bytes_read {0}; - auto remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; - ErrorCode error_code = m_buffer_reader->try_read(buf + num_bytes_read, remaining_bytes_to_read, bytes_read); - if (ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) { - return error_code; - } - num_bytes_read += bytes_read; - m_file_pos += bytes_read; + size_t bytes_read {0}; + auto remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; + auto error_code = m_buffer_reader->try_read(buf + num_bytes_read, remaining_bytes_to_read, bytes_read); + if (ErrorCode_Success != error_code && 
ErrorCode_EndOfFile != error_code) { + return error_code; + } + num_bytes_read += bytes_read; + m_file_pos += bytes_read; - if (num_bytes_read == num_bytes_to_read) { - break; - } + if (num_bytes_read == num_bytes_to_read) { + break; } // refill the buffer if more bytes are to be read - auto error_code = refill_reader_buffer(m_buffer_size); + error_code = refill_reader_buffer(m_buffer_size); if (ErrorCode_EndOfFile == error_code) { break; } else if (ErrorCode_Success != error_code) { @@ -151,15 +156,21 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim bool found_delim {false}; size_t read_size {0}; + if (false == m_buffer_reader.has_value()) { + // refill the buffer if not initialized + auto error_code = refill_reader_buffer(m_buffer_size); + if (ErrorCode_Success != error_code) { + return error_code; + } + } while (false == found_delim) { - if (m_buffer_reader.has_value()) { - size_t length {0}; - if (ErrorCode_Success == m_buffer_reader->try_read_to_delimiter(delim, keep_delimiter, append, str, length)) { - found_delim = true; - } - m_file_pos += length; - read_size += length; + size_t length {0}; + if (ErrorCode_Success == m_buffer_reader->try_read_to_delimiter(delim, keep_delimiter, append, str, length)) { + found_delim = true; } + m_file_pos += length; + read_size += length; + if (false == found_delim) { auto error_code = refill_reader_buffer(m_buffer_size); if (ErrorCode_EndOfFile == error_code) { @@ -246,23 +257,26 @@ void BufferedFileReader::reset_checkpoint () { if (false == m_checkpoint_pos.has_value()) { return; } - if (m_buffer_reader.has_value()) { - auto data_size = m_buffer_reader->get_buffer_size(); - if (data_size != m_buffer_size) { - // allocate new buffer for buffered data that hasn't been seek passed - auto copy_pos = m_buffer_reader->get_pos() & m_buffer_aligned_mask; - auto new_data_size = data_size - copy_pos; - // Use a quantized size for the new buffer size - auto new_buffer_size = 
quantize_to_buffer_size(new_data_size); - - auto new_buffer = make_unique(new_buffer_size); - memcpy(new_buffer.get(), &m_buffer[copy_pos], new_data_size); - m_buffer = std::move(new_buffer); + if (false == m_buffer_reader.has_value()) { + m_checkpoint_pos.reset(); + return; + } - m_buffer_begin_pos += copy_pos; - m_buffer_reader.emplace(m_buffer.get(), new_data_size); - m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); - } + auto data_size = m_buffer_reader->get_buffer_size(); + if (data_size != m_buffer_size) { + // allocate new buffer for buffered data that hasn't been seek passed + auto copy_pos = m_buffer_reader->get_pos() & m_buffer_aligned_mask; + auto new_data_size = data_size - copy_pos; + // Use a quantized size for the new buffer size + auto new_buffer_size = quantize_to_buffer_size(new_data_size); + + auto new_buffer = make_unique(new_buffer_size); + memcpy(new_buffer.get(), &m_buffer[copy_pos], new_data_size); + m_buffer = std::move(new_buffer); + + m_buffer_begin_pos += copy_pos; + m_buffer_reader.emplace(m_buffer.get(), new_data_size); + m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); } m_checkpoint_pos.reset(); } @@ -345,7 +359,14 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, m_buffer_begin_pos += data_size; m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); } else { - if (m_buffer_reader.has_value()) { + if (false == m_buffer_reader.has_value()) { + auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), refill_size, + num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; + } + m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); + } else { // Messy way of copying data from old buffer to new buffer auto data_size = m_buffer_reader->get_buffer_size(); auto new_buffer = make_unique(data_size + refill_size); @@ -358,13 +379,6 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, m_buffer = std::move(new_buffer); 
data_size += num_bytes_refilled; m_buffer_reader.emplace(m_buffer.get(), data_size); - } else { - auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), refill_size, - num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; - } - m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); } } m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); @@ -376,7 +390,8 @@ static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_ num_bytes_read = 0; // keep reading from the fd until seeing a 0, which means eof while (true) { - auto bytes_read = ::read(fd, buffer + num_bytes_read, num_bytes_to_read); + size_t remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; + auto bytes_read = ::read(fd, buffer + num_bytes_read, remaining_bytes_to_read); if (bytes_read == -1) { return ErrorCode_errno; } @@ -384,8 +399,7 @@ static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_ break; } num_bytes_read += bytes_read; - num_bytes_to_read -= bytes_read; - if (num_bytes_to_read == 0) { + if (num_bytes_read == num_bytes_to_read) { return ErrorCode_Success; } } From 3f8156f0d8afef1729aa3058bf73a80444c97cd2 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 25 Jul 2023 21:04:29 -0400 Subject: [PATCH 046/121] Try to add some comments --- components/core/src/BufferedFileReader.hpp | 37 ++++++++++++++++++---- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index f30d8b90e..b9670219c 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -17,7 +17,11 @@ #include "TraceableException.hpp" /** - * Class for reading from a on-disk file with custom buffering + * Class for reading from a on-disk file with custom buffering. 
+ * The BufferedFileReader is designed to support files that only allow + * sequential access, such as files in S3. The class uses a checkpoint + * mechanism to support seeking and reading from a previous file position + * without having to actually accessing the file. */ class BufferedFileReader : public ReaderInterface { public: @@ -147,15 +151,36 @@ class BufferedFileReader : public ReaderInterface { size_t& peek_size); /** - * Sets a checkpoint pos. the BufferedFileReader guarantees that - * all data after checkpoint pos will be buffered in the memory and - * support seek. + * Sets a checkpoint at the current file pos. + * By default, the checkpoint is not set and the BufferedFileReader only + * maintains a fixed size buffer. Seeking before the reading pos is not + * supported since the data might not be in the buffer anymore. + * + * When the checkpoint is set, the BufferedFileReader increases its + * internal buffer size on demand and buffer all data between the + * checkpoint pos and largest ever file_pos in the memory. + * It then support seeking back to a previous file pos that's after the + * checkpoint pos, as the data is guaranteed to be available in the internal + * buffer. + * + * Note: Setting a checkpoint may result in higher memory usage since + * the BufferedFileReader needs to exhaustively buffer the data it reads + * in the buffer. * @return current file pos */ size_t mark_pos(); /** * Disable the checkpoint pos and release buffered data from memory + * The function resize the internal buffer based on the following rules. + * 1. If the current reading_pos is within the same m_buffer_size region as + * the buffer end pos (the file pos that end of buffer corresponds to). i.e. + * buffer_end_pos - file_pos < m_buffer_size + * the buffer will be resized to m_buffer_size bytes + * 2. 
Else, The buffer will be resized to the rounded result of + * quantizing (buffer_end_pos - file_pos) to the nearest multiple of + * 'm_buffer_size' using the rounding method. This ensures that the current + * read pos still resides in the resized buffer */ void reset_checkpoint (); @@ -197,9 +222,9 @@ class BufferedFileReader : public ReaderInterface { static constexpr size_t cDefaultBufferSize = 65536; // Variables - size_t m_file_pos; int m_fd; std::string m_path; + size_t m_file_pos; // Buffer specific data std::unique_ptr m_buffer; @@ -216,4 +241,4 @@ class BufferedFileReader : public ReaderInterface { }; -#endif // BufferedFileReader +#endif // BufferedFileReader_HPP From e58742730d6d588c886cb7a93d1201003938ef61 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 26 Jul 2023 18:41:13 -0400 Subject: [PATCH 047/121] Allow size=0 for BufferReader and simply part of the code --- components/core/src/BufferReader.cpp | 2 +- components/core/src/BufferedFileReader.cpp | 90 +++++++--------------- components/core/src/BufferedFileReader.hpp | 2 - 3 files changed, 30 insertions(+), 64 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index d3c0fec09..58afb7ac2 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -7,7 +7,7 @@ #include BufferReader::BufferReader (const char* data, size_t data_size) { - if (nullptr == data || 0 == data_size) { + if (nullptr == data) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } m_internal_buf = data; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 88f9466f8..cb2925d76 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -66,11 +66,9 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { // adjust the buffer reader pos m_buffer_reader->seek_from_begin(pos - 
m_buffer_begin_pos); } else { - if (m_buffer_reader.has_value()) { - if (ErrorCode_Success == m_buffer_reader->try_seek_from_begin(pos - m_buffer_begin_pos)) { - m_file_pos = pos; - return ErrorCode_Success; - } + if (ErrorCode_Success == m_buffer_reader->try_seek_from_begin(pos - m_buffer_begin_pos)) { + m_file_pos = pos; + return ErrorCode_Success; } // Handle the case where buffer is empty or doesn't contain enough data for seek if (false == m_checkpoint_pos.has_value()) { @@ -80,9 +78,9 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (offset == -1) { return ErrorCode_errno; } - m_buffer_reader.reset(); + m_buffer_reader.emplace(m_buffer.get(), 0); } else { - auto data_size = get_data_size(); + auto data_size = m_buffer_reader->get_buffer_size(); size_t num_bytes_to_refill = pos - (m_buffer_begin_pos + data_size); size_t quantized_refill_size = quantize_to_buffer_size(num_bytes_to_refill); @@ -113,18 +111,13 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, } num_bytes_read = 0; - if (false == m_buffer_reader.has_value()) { - // refill the buffer if not initialized - auto error_code = refill_reader_buffer(m_buffer_size); - if (ErrorCode_Success != error_code) { - return error_code; - } - } // keep reading until enough data is read or an eof is seen while (true) { size_t bytes_read {0}; auto remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; auto error_code = m_buffer_reader->try_read(buf + num_bytes_read, remaining_bytes_to_read, bytes_read); + // here EOF is allowed because it simply means we have exhausted the + // buffer, but not necessarily the file itself if (ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) { return error_code; } @@ -155,26 +148,19 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim } bool found_delim {false}; - size_t read_size {0}; - if (false == m_buffer_reader.has_value()) { - // refill the buffer if not initialized - 
auto error_code = refill_reader_buffer(m_buffer_size); - if (ErrorCode_Success != error_code) { - return error_code; - } - } + size_t total_append_length {0}; while (false == found_delim) { size_t length {0}; if (ErrorCode_Success == m_buffer_reader->try_read_to_delimiter(delim, keep_delimiter, append, str, length)) { found_delim = true; } m_file_pos += length; - read_size += length; + total_append_length += length; if (false == found_delim) { auto error_code = refill_reader_buffer(m_buffer_size); if (ErrorCode_EndOfFile == error_code) { - if (read_size == 0) { + if (total_append_length == 0) { return ErrorCode_EndOfFile; } return ErrorCode_Success; @@ -200,7 +186,7 @@ ErrorCode BufferedFileReader::try_open (const string& path) { m_path = path; m_file_pos = 0; m_buffer_begin_pos = 0; - m_buffer_reader.reset(); + m_buffer_reader.emplace(m_buffer.get(), 0); return ErrorCode_Success; } @@ -257,11 +243,6 @@ void BufferedFileReader::reset_checkpoint () { if (false == m_checkpoint_pos.has_value()) { return; } - if (false == m_buffer_reader.has_value()) { - m_checkpoint_pos.reset(); - return; - } - auto data_size = m_buffer_reader->get_buffer_size(); if (data_size != m_buffer_size) { // allocate new buffer for buffered data that hasn't been seek passed @@ -313,7 +294,7 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha if (-1 == m_fd) { return ErrorCode_NotInit; } - // Refill the buffer if necessary + // Refill the buffer if it is not loaded yet if (false == m_buffer_reader.has_value()) { auto error_code = refill_reader_buffer(m_buffer_size); if (ErrorCode_Success != error_code) { @@ -326,13 +307,6 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha return ErrorCode_Success; } -size_t BufferedFileReader::get_data_size () const { - if (false == m_buffer_reader.has_value()) { - return 0; - } - return m_buffer_reader->get_buffer_size(); -} - size_t BufferedFileReader::quantize_to_buffer_size (size_t size) { 
return (1 + ((size - 1) >> m_buffer_exp)) << m_buffer_exp; } @@ -346,7 +320,7 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, size_t& num_bytes_refilled) { num_bytes_refilled = 0; if (false == m_checkpoint_pos.has_value()) { - auto data_size = get_data_size(); + auto data_size = m_buffer_reader->get_buffer_size(); // recover from a previous reset if necessary if (data_size > refill_size) { m_buffer = make_unique(refill_size); @@ -359,28 +333,22 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, m_buffer_begin_pos += data_size; m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); } else { - if (false == m_buffer_reader.has_value()) { - auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), refill_size, - num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; - } - m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); - } else { - // Messy way of copying data from old buffer to new buffer - auto data_size = m_buffer_reader->get_buffer_size(); - auto new_buffer = make_unique(data_size + refill_size); - memcpy(new_buffer.get(), m_buffer.get(), data_size); - auto error_code = try_read_into_buffer(m_fd, &new_buffer[data_size], refill_size, - num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; - } - m_buffer = std::move(new_buffer); - data_size += num_bytes_refilled; - m_buffer_reader.emplace(m_buffer.get(), data_size); + // Messy way of copying data from old buffer to new buffer + auto data_size = m_buffer_reader->get_buffer_size(); + auto new_buffer = make_unique(data_size + refill_size); + memcpy(new_buffer.get(), m_buffer.get(), data_size); + + // Read data to the new buffer, with offset = data_size + auto error_code = try_read_into_buffer(m_fd, &new_buffer[data_size], refill_size, + num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; } + m_buffer = std::move(new_buffer); + 
m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled); } + // this line is here to handle if we have seek to a position + // before calling refill. if we m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); return ErrorCode_Success; } @@ -388,9 +356,9 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, size_t& num_bytes_read) { num_bytes_read = 0; - // keep reading from the fd until seeing a 0, which means eof + // keep reading from the fd until enough bytes are read while (true) { - size_t remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; + auto remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; auto bytes_read = ::read(fd, buffer + num_bytes_read, remaining_bytes_to_read); if (bytes_read == -1) { return ErrorCode_errno; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index b9670219c..98af81ea2 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -186,8 +186,6 @@ class BufferedFileReader : public ReaderInterface { private: // Methods - [[nodiscard]] size_t get_data_size() const; - /** * Quantize the given size to be the next integer multiple of buffer_size * @param size From 8bf17723bdb6132776e1d4c431b6900a97fff265 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 30 Jul 2023 19:27:31 -0400 Subject: [PATCH 048/121] simplification by not requiring buffer to start from an aligned pos --- components/core/src/BufferedFileReader.cpp | 88 ++++++++++--------- components/core/src/BufferedFileReader.hpp | 13 +-- .../core/tests/test-BufferedFileReader.cpp | 32 ++++--- 3 files changed, 77 insertions(+), 56 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index cb2925d76..5eb27fb5f 100644 --- 
a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -68,24 +68,24 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { } else { if (ErrorCode_Success == m_buffer_reader->try_seek_from_begin(pos - m_buffer_begin_pos)) { m_file_pos = pos; + highest_read_pos = std::max(highest_read_pos, m_file_pos); return ErrorCode_Success; } // Handle the case where buffer is empty or doesn't contain enough data for seek if (false == m_checkpoint_pos.has_value()) { // if checkpoint is not set, simply move the file_pos and invalidate the buffer reader - m_buffer_begin_pos = pos & m_buffer_aligned_mask; - auto offset = lseek(m_fd, m_buffer_begin_pos, SEEK_SET); + auto offset = lseek(m_fd, pos, SEEK_SET); if (offset == -1) { return ErrorCode_errno; } m_buffer_reader.emplace(m_buffer.get(), 0); + m_buffer_begin_pos = pos; } else { auto data_size = m_buffer_reader->get_buffer_size(); size_t num_bytes_to_refill = pos - (m_buffer_begin_pos + data_size); - size_t quantized_refill_size = quantize_to_buffer_size(num_bytes_to_refill); size_t num_bytes_refilled {0}; - auto error_code = refill_reader_buffer(quantized_refill_size, num_bytes_refilled); + auto error_code = refill_reader_buffer(num_bytes_to_refill, num_bytes_refilled); if (ErrorCode_EndOfFile == error_code || num_bytes_refilled < num_bytes_to_refill) { SPDLOG_ERROR("not expecting to seek pass the Entire file"); throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); @@ -96,8 +96,8 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { m_buffer_reader->seek_from_begin(pos - m_buffer_begin_pos); } } - m_file_pos = pos; + highest_read_pos = std::max(highest_read_pos, m_file_pos); return ErrorCode_Success; } @@ -130,7 +130,7 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, // refill the buffer if more bytes are to be read error_code = refill_reader_buffer(m_buffer_size); if (ErrorCode_EndOfFile == error_code) { - break; + 
break; } else if (ErrorCode_Success != error_code) { return error_code; } @@ -138,6 +138,7 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, if (num_bytes_read == 0) { return ErrorCode_EndOfFile; } + highest_read_pos = std::max(highest_read_pos, m_file_pos); return ErrorCode_Success; } @@ -169,6 +170,7 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim } } } + highest_read_pos = std::max(highest_read_pos, m_file_pos); return ErrorCode_Success; } @@ -210,7 +212,6 @@ void BufferedFileReader::close () { m_fd = -1; if (m_checkpoint_pos.has_value()) { - SPDLOG_DEBUG("close file without resetting checkpoint"); m_buffer = make_unique(m_buffer_size); m_checkpoint_pos.reset(); } @@ -229,36 +230,48 @@ ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { return ErrorCode_Success; } -size_t BufferedFileReader::mark_pos() { +void BufferedFileReader::resize_buffer_from_pos (size_t pos) { + + const auto copy_size = m_buffer_reader->get_buffer_size() - pos; + // Use a quantized size for the underlying buffer size + auto new_buffer_size = quantize_to_buffer_size(copy_size); + + auto new_buffer = make_unique(new_buffer_size); + memcpy(new_buffer.get(), &m_buffer[pos], copy_size); + m_buffer = std::move(new_buffer); + m_buffer_begin_pos += pos; + + m_buffer_reader.emplace(m_buffer.get(), copy_size); +} + +size_t BufferedFileReader::set_checkpoint() { if (m_checkpoint_pos.has_value()) { - SPDLOG_ERROR("mark_pos can not be called twice"); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + if (m_checkpoint_pos > m_file_pos) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } else if (m_checkpoint_pos < m_file_pos) { + if (m_buffer_reader->get_buffer_size() != m_buffer_size) { + // allocate new buffer for buffered data starting from pos + resize_buffer_from_pos(m_buffer_reader->get_pos()); + m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); + } + 
} } m_checkpoint_pos = m_file_pos; return m_file_pos; } -void BufferedFileReader::reset_checkpoint () { - // alternatively, we can keep claiming back the memory +void BufferedFileReader::clear_checkpoint () { if (false == m_checkpoint_pos.has_value()) { return; } - auto data_size = m_buffer_reader->get_buffer_size(); - if (data_size != m_buffer_size) { - // allocate new buffer for buffered data that hasn't been seek passed - auto copy_pos = m_buffer_reader->get_pos() & m_buffer_aligned_mask; - auto new_data_size = data_size - copy_pos; - // Use a quantized size for the new buffer size - auto new_buffer_size = quantize_to_buffer_size(new_data_size); - - auto new_buffer = make_unique(new_buffer_size); - memcpy(new_buffer.get(), &m_buffer[copy_pos], new_data_size); - m_buffer = std::move(new_buffer); - - m_buffer_begin_pos += copy_pos; - m_buffer_reader.emplace(m_buffer.get(), new_data_size); - m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); + const auto buffer_end_file_pos = m_buffer_reader->get_buffer_size() + m_buffer_begin_pos; + if (buffer_end_file_pos <= highest_read_pos || buffer_end_file_pos - highest_read_pos > m_buffer_size) { + throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); } + + m_file_pos = highest_read_pos; + const auto copy_pos = m_file_pos - m_buffer_begin_pos; + resize_buffer_from_pos(copy_pos); m_checkpoint_pos.reset(); } @@ -285,7 +298,6 @@ ErrorCode BufferedFileReader::set_buffer_size (size_t buffer_size) { m_buffer_exp = static_cast(log2(static_cast(buffer_size))); m_buffer_size = buffer_size; - m_buffer_aligned_mask = ~(m_buffer_size - 1); return ErrorCode_Success; } @@ -316,40 +328,34 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size) { return refill_reader_buffer (refill_size, num_bytes_refilled); } -ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size, +ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill, size_t& num_bytes_refilled) { 
num_bytes_refilled = 0; + const auto quantized_refill_size = quantize_to_buffer_size(num_bytes_to_refill); if (false == m_checkpoint_pos.has_value()) { - auto data_size = m_buffer_reader->get_buffer_size(); - // recover from a previous reset if necessary - if (data_size > refill_size) { - m_buffer = make_unique(refill_size); - } auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), - refill_size, num_bytes_refilled); + quantized_refill_size, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } - m_buffer_begin_pos += data_size; m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); + m_buffer_begin_pos = m_file_pos; } else { // Messy way of copying data from old buffer to new buffer auto data_size = m_buffer_reader->get_buffer_size(); - auto new_buffer = make_unique(data_size + refill_size); + auto new_buffer = make_unique(data_size + quantized_refill_size); memcpy(new_buffer.get(), m_buffer.get(), data_size); // Read data to the new buffer, with offset = data_size - auto error_code = try_read_into_buffer(m_fd, &new_buffer[data_size], refill_size, + auto error_code = try_read_into_buffer(m_fd, &new_buffer[data_size], quantized_refill_size, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } m_buffer = std::move(new_buffer); m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled); + m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); } - // this line is here to handle if we have seek to a position - // before calling refill. if we - m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); return ErrorCode_Success; } diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 98af81ea2..b8d7295c2 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -168,7 +168,7 @@ class BufferedFileReader : public ReaderInterface { * in the buffer. 
* @return current file pos */ - size_t mark_pos(); + size_t set_checkpoint(); /** * Disable the checkpoint pos and release buffered data from memory @@ -182,7 +182,7 @@ class BufferedFileReader : public ReaderInterface { * 'm_buffer_size' using the rounding method. This ensures that the current * read pos still resides in the resized buffer */ - void reset_checkpoint (); + void clear_checkpoint (); private: // Methods @@ -209,12 +209,14 @@ class BufferedFileReader : public ReaderInterface { /** * Similar to refill_reader_buffer, except that number of bytes refilled * is returned by reference - * @param refill_size + * @param num_bytes_to_refill * @param num_bytes_refilled Returns the number of bytes refilled by * reference * @return Same as refill_reader_buffer(size_t refill_size) */ - [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size, size_t& num_bytes_refilled); + [[nodiscard]] ErrorCode refill_reader_buffer(size_t num_bytes_to_refill, size_t& num_bytes_refilled); + + void resize_buffer_from_pos(size_t pos); // Constants static constexpr size_t cDefaultBufferSize = 65536; @@ -232,10 +234,11 @@ class BufferedFileReader : public ReaderInterface { // Values for buffer related calculation size_t m_buffer_exp; size_t m_buffer_size; - size_t m_buffer_aligned_mask; // Variables for checkpoint support std::optional m_checkpoint_pos; + size_t highest_read_pos {0}; + }; diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index b875bfb5a..1b399157d 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -106,7 +106,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { REQUIRE(file_reader.get_pos() == num_bytes_to_read_1); // set a checkpoint - size_t checkpoint_pos = file_reader.mark_pos(); + size_t checkpoint_pos = file_reader.set_checkpoint(); // keep reading some data size_t num_bytes_to_read_2 = 345212; @@ -136,21 +136,31 
@@ TEST_CASE("Test reading data", "[BufferedFileReader]") { num_byte_read)); REQUIRE(num_bytes_to_read_3 == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, num_bytes_to_read_3)); + // update the latest_file_pos + latest_file_pos = file_reader.get_pos(); - // reset, and then seek back should fail - file_reader.reset_checkpoint(); + // seek back to somewhere between the checkpoint and latest data, and set a new checkpoint + file_reader.seek_from_begin((latest_file_pos + checkpoint_pos) / 2); + file_reader.set_checkpoint(); + // the previous seek_pos should be unavailable REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos_1)); - // make sure data read after checkpoint-reset are still correct - size_t num_bytes_to_read_4 = 65780; + // make sure data read after checkpoint-set are still correct + size_t num_bytes_to_read_4 = 4096; REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_4, num_byte_read)); REQUIRE(num_bytes_to_read_4 == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos + num_bytes_to_read_3, + REQUIRE(0 == memcmp(read_buffer, test_data + (latest_file_pos + checkpoint_pos) / 2, num_bytes_to_read_4)); - // Make sure now we can't reset back to checkpoint - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos_1)); + + file_reader.clear_checkpoint(); + size_t default_buffer_size = 65536; + // make sure data read after checkpoint-reset are still correct; + REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, default_buffer_size, + num_byte_read)); + REQUIRE(default_buffer_size == num_byte_read); + REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, default_buffer_size)); } SECTION("seek with delayed read") { @@ -158,10 +168,11 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { file_reader.open(test_file_path); // first, advance to some random file_pos - REQUIRE(ErrorCode_Success == 
file_reader.try_seek_from_begin(45313)); + size_t begin_read_pos = 45313; + REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(begin_read_pos)); // set a checkpoint - size_t checkpoint_pos = file_reader.mark_pos(); + size_t checkpoint_pos = file_reader.set_checkpoint(); // keep reading some data size_t num_bytes_to_read; @@ -171,6 +182,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); REQUIRE(file_reader.get_pos() == checkpoint_pos + num_bytes_to_read); + REQUIRE(0 == memcmp(read_buffer, test_data + begin_read_pos, num_bytes_to_read)); // now seek back to some where between size_t seek_pos = file_reader.get_pos() / 2; From d8023104214ad76a5121b882d3a7b4726f762b84 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 30 Jul 2023 19:41:56 -0400 Subject: [PATCH 049/121] My way of simplification --- components/core/src/BufferedFileReader.cpp | 16 +++++++--------- components/core/src/BufferedFileReader.hpp | 2 ++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 5eb27fb5f..07f83ac3f 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -64,9 +64,9 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Failure; } // adjust the buffer reader pos - m_buffer_reader->seek_from_begin(pos - m_buffer_begin_pos); + m_buffer_reader->seek_from_begin(get_equivalent_buffer_pos(pos)); } else { - if (ErrorCode_Success == m_buffer_reader->try_seek_from_begin(pos - m_buffer_begin_pos)) { + if (ErrorCode_Success == m_buffer_reader->try_seek_from_begin(get_equivalent_buffer_pos(pos))) { m_file_pos = pos; highest_read_pos = std::max(highest_read_pos, m_file_pos); return ErrorCode_Success; @@ -81,8 +81,7 @@ ErrorCode 
BufferedFileReader::try_seek_from_begin (size_t pos) { m_buffer_reader.emplace(m_buffer.get(), 0); m_buffer_begin_pos = pos; } else { - auto data_size = m_buffer_reader->get_buffer_size(); - size_t num_bytes_to_refill = pos - (m_buffer_begin_pos + data_size); + size_t num_bytes_to_refill = pos - (m_buffer_begin_pos + m_buffer_reader->get_buffer_size()); size_t num_bytes_refilled {0}; auto error_code = refill_reader_buffer(num_bytes_to_refill, num_bytes_refilled); @@ -93,7 +92,7 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (ErrorCode_Success != error_code) { return error_code; } - m_buffer_reader->seek_from_begin(pos - m_buffer_begin_pos); + m_buffer_reader->seek_from_begin(get_equivalent_buffer_pos(pos)); } } m_file_pos = pos; @@ -252,7 +251,7 @@ size_t BufferedFileReader::set_checkpoint() { if (m_buffer_reader->get_buffer_size() != m_buffer_size) { // allocate new buffer for buffered data starting from pos resize_buffer_from_pos(m_buffer_reader->get_pos()); - m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); + m_buffer_reader->seek_from_begin(get_equivalent_buffer_pos(m_file_pos)); } } } @@ -270,8 +269,7 @@ void BufferedFileReader::clear_checkpoint () { } m_file_pos = highest_read_pos; - const auto copy_pos = m_file_pos - m_buffer_begin_pos; - resize_buffer_from_pos(copy_pos); + resize_buffer_from_pos(get_equivalent_buffer_pos(m_file_pos)); m_checkpoint_pos.reset(); } @@ -354,7 +352,7 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill, } m_buffer = std::move(new_buffer); m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled); - m_buffer_reader->seek_from_begin(m_file_pos - m_buffer_begin_pos); + m_buffer_reader->seek_from_begin(get_equivalent_buffer_pos(m_file_pos)); } return ErrorCode_Success; } diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index b8d7295c2..7c0fc9c0f 100644 --- 
a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -218,6 +218,8 @@ class BufferedFileReader : public ReaderInterface { void resize_buffer_from_pos(size_t pos); + [[nodiscard]] size_t get_equivalent_buffer_pos(size_t file_pos) const { return file_pos - m_buffer_begin_pos; } + // Constants static constexpr size_t cDefaultBufferSize = 65536; From eae3caef43bc00e4e45b6395bf65e511f5276343 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 30 Jul 2023 20:20:17 -0400 Subject: [PATCH 050/121] some other small refactoring --- components/core/src/BufferedFileReader.cpp | 44 ++++++++++++---------- components/core/src/BufferedFileReader.hpp | 12 +++++- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 07f83ac3f..d1334d7a7 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -64,9 +64,9 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Failure; } // adjust the buffer reader pos - m_buffer_reader->seek_from_begin(get_equivalent_buffer_pos(pos)); + m_buffer_reader->seek_from_begin(get_corresponding_offset(pos)); } else { - if (ErrorCode_Success == m_buffer_reader->try_seek_from_begin(get_equivalent_buffer_pos(pos))) { + if (ErrorCode_Success == m_buffer_reader->try_seek_from_begin(get_corresponding_offset(pos))) { m_file_pos = pos; highest_read_pos = std::max(highest_read_pos, m_file_pos); return ErrorCode_Success; @@ -92,7 +92,7 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (ErrorCode_Success != error_code) { return error_code; } - m_buffer_reader->seek_from_begin(get_equivalent_buffer_pos(pos)); + m_buffer_reader->seek_from_begin(get_corresponding_offset(pos)); } } m_file_pos = pos; @@ -229,20 +229,6 @@ ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) 
const { return ErrorCode_Success; } -void BufferedFileReader::resize_buffer_from_pos (size_t pos) { - - const auto copy_size = m_buffer_reader->get_buffer_size() - pos; - // Use a quantized size for the underlying buffer size - auto new_buffer_size = quantize_to_buffer_size(copy_size); - - auto new_buffer = make_unique(new_buffer_size); - memcpy(new_buffer.get(), &m_buffer[pos], copy_size); - m_buffer = std::move(new_buffer); - m_buffer_begin_pos += pos; - - m_buffer_reader.emplace(m_buffer.get(), copy_size); -} - size_t BufferedFileReader::set_checkpoint() { if (m_checkpoint_pos.has_value()) { if (m_checkpoint_pos > m_file_pos) { @@ -251,7 +237,7 @@ size_t BufferedFileReader::set_checkpoint() { if (m_buffer_reader->get_buffer_size() != m_buffer_size) { // allocate new buffer for buffered data starting from pos resize_buffer_from_pos(m_buffer_reader->get_pos()); - m_buffer_reader->seek_from_begin(get_equivalent_buffer_pos(m_file_pos)); + m_buffer_reader->seek_from_begin(get_corresponding_offset(m_file_pos)); } } } @@ -269,7 +255,7 @@ void BufferedFileReader::clear_checkpoint () { } m_file_pos = highest_read_pos; - resize_buffer_from_pos(get_equivalent_buffer_pos(m_file_pos)); + resize_buffer_from_pos(get_corresponding_offset(m_file_pos)); m_checkpoint_pos.reset(); } @@ -352,11 +338,29 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill, } m_buffer = std::move(new_buffer); m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled); - m_buffer_reader->seek_from_begin(get_equivalent_buffer_pos(m_file_pos)); + m_buffer_reader->seek_from_begin(get_corresponding_offset(m_file_pos)); } return ErrorCode_Success; } +void BufferedFileReader::resize_buffer_from_pos (size_t pos) { + + const auto copy_size = m_buffer_reader->get_buffer_size() - pos; + // Use a quantized size for the underlying buffer size + auto new_buffer_size = quantize_to_buffer_size(copy_size); + + auto new_buffer = make_unique(new_buffer_size); + 
memcpy(new_buffer.get(), &m_buffer[pos], copy_size); + m_buffer = std::move(new_buffer); + m_buffer_begin_pos += pos; + + m_buffer_reader.emplace(m_buffer.get(), copy_size); +} + +size_t BufferedFileReader::get_corresponding_offset (size_t file_pos) const { + return file_pos - m_buffer_begin_pos; +} + static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, size_t& num_bytes_read) { num_bytes_read = 0; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 7c0fc9c0f..fcf1a531f 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -216,9 +216,19 @@ class BufferedFileReader : public ReaderInterface { */ [[nodiscard]] ErrorCode refill_reader_buffer(size_t num_bytes_to_refill, size_t& num_bytes_refilled); + /** + * Resize the internal reader buffer by "dropping" all data before pos + * offset in the buffer + * @param pos + */ void resize_buffer_from_pos(size_t pos); - [[nodiscard]] size_t get_equivalent_buffer_pos(size_t file_pos) const { return file_pos - m_buffer_begin_pos; } + /** + * return the file_pos's corresponding offset in the internal buffer + * @param file_pos + * @return + */ + [[nodiscard]] size_t get_corresponding_offset(size_t file_pos) const; // Constants static constexpr size_t cDefaultBufferSize = 65536; From 8f63dc2ae322f64bcf5f31f60ba29e2191bc2e06 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 30 Jul 2023 20:56:40 -0400 Subject: [PATCH 051/121] Simplify away set buffer_size. 
--- components/core/src/BufferedFileReader.cpp | 54 ++++++---------------- components/core/src/BufferedFileReader.hpp | 13 +----- 2 files changed, 15 insertions(+), 52 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index d1334d7a7..169adeb2c 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -21,15 +21,16 @@ using std::string; static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, size_t& num_bytes_read); -BufferedFileReader::BufferedFileReader () { +BufferedFileReader::BufferedFileReader () : BufferedFileReader(cDefaultBufferSize) {} + +BufferedFileReader::BufferedFileReader (size_t buffer_size) { m_file_pos = 0; m_fd = -1; m_checkpoint_pos.reset(); - if (auto error_code = set_buffer_size(cDefaultBufferSize); - ErrorCode_Success != error_code) { - SPDLOG_ERROR("Failed to init reader buffer size to be {}", cDefaultBufferSize); - throw OperationFailed(error_code, __FILENAME__, __LINE__); + if (buffer_size % 4096 != 0) { + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } + m_buffer_size = buffer_size; m_buffer = make_unique(m_buffer_size); } @@ -115,17 +116,16 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, size_t bytes_read {0}; auto remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; auto error_code = m_buffer_reader->try_read(buf + num_bytes_read, remaining_bytes_to_read, bytes_read); - // here EOF is allowed because it simply means we have exhausted the - // buffer, but not necessarily the file itself - if (ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) { + if (ErrorCode_Success == error_code) { + num_bytes_read += bytes_read; + m_file_pos += bytes_read; + if (num_bytes_read == num_bytes_to_read) { + break; + } + } else if (ErrorCode_EndOfFile != error_code) { return error_code; } - num_bytes_read += bytes_read; - m_file_pos += 
bytes_read; - if (num_bytes_read == num_bytes_to_read) { - break; - } // refill the buffer if more bytes are to be read error_code = refill_reader_buffer(m_buffer_size); if (ErrorCode_EndOfFile == error_code) { @@ -259,32 +259,6 @@ void BufferedFileReader::clear_checkpoint () { m_checkpoint_pos.reset(); } -ErrorCode BufferedFileReader::set_buffer_size (size_t buffer_size) { - if (m_fd != -1) { - SPDLOG_ERROR("Buffer size can not be changed when the file is open"); - return ErrorCode_Failure; - } - if (buffer_size == 0) { - SPDLOG_ERROR("Buffer size can not be set to 0"); - return ErrorCode_BadParam; - } - if (buffer_size % 4096 != 0) { - SPDLOG_ERROR("Buffer size {} is not a multiple of page size", buffer_size); - return ErrorCode_BadParam; - } - // fast calculation to check if buffer_size is a power of 2 leveraged - // from https://stackoverflow.com/questions/51094594/ - // how-to-check-if-exactly-one-bit-is-set-in-an-int - if (false == (!(buffer_size & (buffer_size-1)))) { - SPDLOG_ERROR("Buffer size {} is not a power of 2", buffer_size); - return ErrorCode_BadParam; - } - - m_buffer_exp = static_cast(log2(static_cast(buffer_size))); - m_buffer_size = buffer_size; - return ErrorCode_Success; -} - ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const char*& data_ptr, size_t& peek_size) { if (-1 == m_fd) { @@ -304,7 +278,7 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha } size_t BufferedFileReader::quantize_to_buffer_size (size_t size) { - return (1 + ((size - 1) >> m_buffer_exp)) << m_buffer_exp; + return (1 + ((size - 1) / m_buffer_size)) * m_buffer_size; } ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size) { diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index fcf1a531f..8aaa2a1e1 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -40,6 +40,7 @@ class BufferedFileReader : 
public ReaderInterface { // Constructors BufferedFileReader(); + BufferedFileReader(size_t buffer_size); ~BufferedFileReader(); // Methods implementing the ReaderInterface @@ -122,17 +123,6 @@ class BufferedFileReader : public ReaderInterface { */ [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer) const; - /** - * Set the internal buffer - * @param buffer_size - * @return ErrorCode_Success on success - * @return ErrorCode_Failure if the file is not closed - * @return ErrorCode_BadParam if either: - * 1. buffer_size is not a integer multiple of 4096 - * 2. buffer_size is not a power of 2 - */ - [[nodiscard]] ErrorCode set_buffer_size(size_t buffer_size); - /** * Peeks the next peek_size bytes of data without advancing the file * pos. @@ -244,7 +234,6 @@ class BufferedFileReader : public ReaderInterface { size_t m_buffer_begin_pos; // Values for buffer related calculation - size_t m_buffer_exp; size_t m_buffer_size; // Variables for checkpoint support From 07767825d2ba89ac17d26e19186f747b2cb9c3a1 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 30 Jul 2023 21:39:53 -0400 Subject: [PATCH 052/121] Simplify seek_from_begin --- components/core/src/BufferedFileReader.cpp | 66 ++++++++++++---------- components/core/src/BufferedFileReader.hpp | 2 + 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 169adeb2c..2f1cf0af8 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -54,47 +54,47 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } - if (pos <= m_file_pos) { - if (false == m_checkpoint_pos.has_value()) { - SPDLOG_ERROR("Error: Seek back when checkpoint is not enabled"); + if (m_checkpoint_pos.has_value() == false) { + if (pos < m_file_pos) { return ErrorCode_Failure; } + if (ErrorCode_Success == + 
m_buffer_reader->try_seek_from_begin(get_corresponding_offset(pos))){ + m_file_pos = pos; + highest_read_pos = std::max(highest_read_pos, m_file_pos); + return ErrorCode_Success; + } + // if checkpoint is not set, simply move the file_pos and invalidate the buffer reader + auto offset = lseek(m_fd, pos, SEEK_SET); + if (offset == -1) { + return ErrorCode_errno; + } + m_buffer_reader.emplace(m_buffer.get(), 0); + m_buffer_begin_pos = pos; + } else { if (pos < m_checkpoint_pos) { - SPDLOG_ERROR("Error: trying to seek to {} which is ahead of checkpoint: {}", - pos, m_checkpoint_pos.value()); return ErrorCode_Failure; + } else if (pos < m_file_pos) { + m_buffer_reader->seek_from_begin(get_corresponding_offset(pos)); } - // adjust the buffer reader pos - m_buffer_reader->seek_from_begin(get_corresponding_offset(pos)); - } else { - if (ErrorCode_Success == m_buffer_reader->try_seek_from_begin(get_corresponding_offset(pos))) { + if (ErrorCode_Success == + m_buffer_reader->try_seek_from_begin(get_corresponding_offset(pos))) { m_file_pos = pos; highest_read_pos = std::max(highest_read_pos, m_file_pos); return ErrorCode_Success; } - // Handle the case where buffer is empty or doesn't contain enough data for seek - if (false == m_checkpoint_pos.has_value()) { - // if checkpoint is not set, simply move the file_pos and invalidate the buffer reader - auto offset = lseek(m_fd, pos, SEEK_SET); - if (offset == -1) { - return ErrorCode_errno; - } - m_buffer_reader.emplace(m_buffer.get(), 0); - m_buffer_begin_pos = pos; - } else { - size_t num_bytes_to_refill = pos - (m_buffer_begin_pos + m_buffer_reader->get_buffer_size()); - size_t num_bytes_refilled {0}; - auto error_code = refill_reader_buffer(num_bytes_to_refill, num_bytes_refilled); - if (ErrorCode_EndOfFile == error_code || num_bytes_refilled < num_bytes_to_refill) { - SPDLOG_ERROR("not expecting to seek pass the Entire file"); - throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); - } - if (ErrorCode_Success 
!= error_code) { - return error_code; - } - m_buffer_reader->seek_from_begin(get_corresponding_offset(pos)); + size_t num_bytes_to_refill = pos - get_buffer_end_pos(); + size_t num_bytes_refilled{0}; + auto error_code = refill_reader_buffer(num_bytes_to_refill, num_bytes_refilled); + if (ErrorCode_EndOfFile == error_code || num_bytes_refilled < num_bytes_to_refill) { + SPDLOG_ERROR("not expecting to seek pass the Entire file"); + throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); } + if (ErrorCode_Success != error_code) { + return error_code; + } + m_buffer_reader->seek_from_begin(get_corresponding_offset(pos)); } m_file_pos = pos; highest_read_pos = std::max(highest_read_pos, m_file_pos); @@ -249,7 +249,7 @@ void BufferedFileReader::clear_checkpoint () { if (false == m_checkpoint_pos.has_value()) { return; } - const auto buffer_end_file_pos = m_buffer_reader->get_buffer_size() + m_buffer_begin_pos; + const auto buffer_end_file_pos = get_buffer_end_pos(); if (buffer_end_file_pos <= highest_read_pos || buffer_end_file_pos - highest_read_pos > m_buffer_size) { throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); } @@ -335,6 +335,10 @@ size_t BufferedFileReader::get_corresponding_offset (size_t file_pos) const { return file_pos - m_buffer_begin_pos; } +size_t BufferedFileReader::get_buffer_end_pos () const { + return m_buffer_begin_pos + m_buffer_reader->get_buffer_size(); +} + static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, size_t& num_bytes_read) { num_bytes_read = 0; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 8aaa2a1e1..633aa3b97 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -220,6 +220,8 @@ class BufferedFileReader : public ReaderInterface { */ [[nodiscard]] size_t get_corresponding_offset(size_t file_pos) const; + [[nodiscard]] size_t get_buffer_end_pos() const; + // 
Constants static constexpr size_t cDefaultBufferSize = 65536; From 41b9bced45f7f053018cdbda607dcfd50672e246 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 30 Jul 2023 21:42:50 -0400 Subject: [PATCH 053/121] Maybe unnecessary simplification --- components/core/src/BufferedFileReader.cpp | 20 +++++++------------- components/core/src/BufferedFileReader.hpp | 10 ---------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 2f1cf0af8..7d1f3c292 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -85,16 +85,16 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { } size_t num_bytes_to_refill = pos - get_buffer_end_pos(); - size_t num_bytes_refilled{0}; - auto error_code = refill_reader_buffer(num_bytes_to_refill, num_bytes_refilled); - if (ErrorCode_EndOfFile == error_code || num_bytes_refilled < num_bytes_to_refill) { - SPDLOG_ERROR("not expecting to seek pass the Entire file"); + auto error_code = refill_reader_buffer(num_bytes_to_refill); + if (ErrorCode_EndOfFile == error_code) { throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); } if (ErrorCode_Success != error_code) { return error_code; } - m_buffer_reader->seek_from_begin(get_corresponding_offset(pos)); + if (ErrorCode_Success != m_buffer_reader->try_seek_from_begin(get_corresponding_offset(pos))) { + throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); + } } m_file_pos = pos; highest_read_pos = std::max(highest_read_pos, m_file_pos); @@ -281,14 +281,8 @@ size_t BufferedFileReader::quantize_to_buffer_size (size_t size) { return (1 + ((size - 1) / m_buffer_size)) * m_buffer_size; } -ErrorCode BufferedFileReader::refill_reader_buffer (size_t refill_size) { - size_t num_bytes_refilled; - return refill_reader_buffer (refill_size, num_bytes_refilled); -} - -ErrorCode 
BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill, - size_t& num_bytes_refilled) { - num_bytes_refilled = 0; +ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) { + size_t num_bytes_refilled = 0; const auto quantized_refill_size = quantize_to_buffer_size(num_bytes_to_refill); if (false == m_checkpoint_pos.has_value()) { auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 633aa3b97..2400b9315 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -196,16 +196,6 @@ class BufferedFileReader : public ReaderInterface { */ [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); - /** - * Similar to refill_reader_buffer, except that number of bytes refilled - * is returned by reference - * @param num_bytes_to_refill - * @param num_bytes_refilled Returns the number of bytes refilled by - * reference - * @return Same as refill_reader_buffer(size_t refill_size) - */ - [[nodiscard]] ErrorCode refill_reader_buffer(size_t num_bytes_to_refill, size_t& num_bytes_refilled); - /** * Resize the internal reader buffer by "dropping" all data before pos * offset in the buffer From 0e6e00c77379760e9f035b831ce1b3c5238e89cb Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 30 Jul 2023 21:46:47 -0400 Subject: [PATCH 054/121] simplify BufferReader --- components/core/src/BufferedFileReader.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 7d1f3c292..701cd07d8 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -109,17 +109,20 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, if (nullptr == buf) { return 
ErrorCode_BadParam; } + if (num_bytes_to_read == 0) { + return ErrorCode_Success; + } num_bytes_read = 0; - // keep reading until enough data is read or an eof is seen while (true) { size_t bytes_read {0}; - auto remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; - auto error_code = m_buffer_reader->try_read(buf + num_bytes_read, remaining_bytes_to_read, bytes_read); + auto error_code = m_buffer_reader->try_read(buf, num_bytes_to_read, bytes_read); if (ErrorCode_Success == error_code) { + buf += bytes_read; num_bytes_read += bytes_read; + num_bytes_to_read -= bytes_read; m_file_pos += bytes_read; - if (num_bytes_read == num_bytes_to_read) { + if (0 == num_bytes_to_read) { break; } } else if (ErrorCode_EndOfFile != error_code) { From ff0d016b05be57a78716fa86d5af6fad09d1dc5e Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 31 Jul 2023 19:03:41 -0400 Subject: [PATCH 055/121] optimize for buffer alignment --- components/core/src/BufferedFileReader.cpp | 26 +++++++++++++++++----- components/core/src/BufferedFileReader.hpp | 1 - 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 701cd07d8..8c8001a94 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -252,6 +252,9 @@ void BufferedFileReader::clear_checkpoint () { if (false == m_checkpoint_pos.has_value()) { return; } + + // TODO: a check to make sure that highest_read_pos should always be in + // the same default_buffer_size range with buffer_end_file_pos const auto buffer_end_file_pos = get_buffer_end_pos(); if (buffer_end_file_pos <= highest_read_pos || buffer_end_file_pos - highest_read_pos > m_buffer_size) { throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); @@ -281,35 +284,46 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha } size_t 
BufferedFileReader::quantize_to_buffer_size (size_t size) { + if (size == 0) { + return 0; + } return (1 + ((size - 1) / m_buffer_size)) * m_buffer_size; } ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) { size_t num_bytes_refilled = 0; - const auto quantized_refill_size = quantize_to_buffer_size(num_bytes_to_refill); + + const auto buffer_end_pos = get_buffer_end_pos(); + size_t num_bytes_alignment = m_buffer_size - (buffer_end_pos % m_buffer_size); + if (false == m_checkpoint_pos.has_value()) { auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), - quantized_refill_size, num_bytes_refilled); + num_bytes_alignment, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } + m_buffer_begin_pos = buffer_end_pos; m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); - m_buffer_begin_pos = m_file_pos; } else { + while (num_bytes_alignment < num_bytes_to_refill) { + num_bytes_alignment += m_buffer_size; + } // Messy way of copying data from old buffer to new buffer auto data_size = m_buffer_reader->get_buffer_size(); - auto new_buffer = make_unique(data_size + quantized_refill_size); + const auto new_buffer_size = data_size + num_bytes_alignment; + auto new_buffer = make_unique(new_buffer_size); memcpy(new_buffer.get(), m_buffer.get(), data_size); // Read data to the new buffer, with offset = data_size - auto error_code = try_read_into_buffer(m_fd, &new_buffer[data_size], quantized_refill_size, + auto error_code = try_read_into_buffer(m_fd, &new_buffer[data_size], num_bytes_alignment, num_bytes_refilled); if (error_code != ErrorCode_Success) { return error_code; } m_buffer = std::move(new_buffer); + const auto prev_pos = m_buffer_reader->get_pos(); m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled); - m_buffer_reader->seek_from_begin(get_corresponding_offset(m_file_pos)); + m_buffer_reader->seek_from_begin(prev_pos); } return ErrorCode_Success; } diff --git 
a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 2400b9315..0c1e758d8 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -227,7 +227,6 @@ class BufferedFileReader : public ReaderInterface { // Values for buffer related calculation size_t m_buffer_size; - // Variables for checkpoint support std::optional m_checkpoint_pos; size_t highest_read_pos {0}; From ca8c05ad45d4547288019e0e7a1ff2081e410982 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 31 Jul 2023 21:40:59 -0400 Subject: [PATCH 056/121] Handle buffer combining case --- components/core/src/BufferedFileReader.cpp | 53 ++++++++++++++-------- components/core/src/BufferedFileReader.hpp | 3 +- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 8c8001a94..1bc469c4e 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -23,14 +23,15 @@ static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_ BufferedFileReader::BufferedFileReader () : BufferedFileReader(cDefaultBufferSize) {} -BufferedFileReader::BufferedFileReader (size_t buffer_size) { +BufferedFileReader::BufferedFileReader (size_t base_buffer_size) { m_file_pos = 0; m_fd = -1; m_checkpoint_pos.reset(); - if (buffer_size % 4096 != 0) { + if (base_buffer_size % 4096 != 0) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - m_buffer_size = buffer_size; + m_base_buffer_size = base_buffer_size; + m_buffer_size = m_base_buffer_size; m_buffer = make_unique(m_buffer_size); } @@ -130,7 +131,7 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, } // refill the buffer if more bytes are to be read - error_code = refill_reader_buffer(m_buffer_size); + error_code = 
refill_reader_buffer(m_base_buffer_size); if (ErrorCode_EndOfFile == error_code) { break; } else if (ErrorCode_Success != error_code) { @@ -161,7 +162,7 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim total_append_length += length; if (false == found_delim) { - auto error_code = refill_reader_buffer(m_buffer_size); + auto error_code = refill_reader_buffer(m_base_buffer_size); if (ErrorCode_EndOfFile == error_code) { if (total_append_length == 0) { return ErrorCode_EndOfFile; @@ -214,6 +215,7 @@ void BufferedFileReader::close () { m_fd = -1; if (m_checkpoint_pos.has_value()) { + m_buffer_size = m_base_buffer_size; m_buffer = make_unique(m_buffer_size); m_checkpoint_pos.reset(); } @@ -237,7 +239,7 @@ size_t BufferedFileReader::set_checkpoint() { if (m_checkpoint_pos > m_file_pos) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } else if (m_checkpoint_pos < m_file_pos) { - if (m_buffer_reader->get_buffer_size() != m_buffer_size) { + if (m_buffer_reader->get_buffer_size() != m_base_buffer_size) { // allocate new buffer for buffered data starting from pos resize_buffer_from_pos(m_buffer_reader->get_pos()); m_buffer_reader->seek_from_begin(get_corresponding_offset(m_file_pos)); @@ -256,7 +258,7 @@ void BufferedFileReader::clear_checkpoint () { // TODO: a check to make sure that highest_read_pos should always be in // the same default_buffer_size range with buffer_end_file_pos const auto buffer_end_file_pos = get_buffer_end_pos(); - if (buffer_end_file_pos <= highest_read_pos || buffer_end_file_pos - highest_read_pos > m_buffer_size) { + if (buffer_end_file_pos <= highest_read_pos || buffer_end_file_pos - highest_read_pos > m_base_buffer_size) { throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); } @@ -272,7 +274,7 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha } // Refill the buffer if it is not loaded yet if (false == m_buffer_reader.has_value()) { - auto 
error_code = refill_reader_buffer(m_buffer_size); + auto error_code = refill_reader_buffer(m_base_buffer_size); if (ErrorCode_Success != error_code) { data_ptr = nullptr; peek_size = 0; @@ -287,14 +289,32 @@ size_t BufferedFileReader::quantize_to_buffer_size (size_t size) { if (size == 0) { return 0; } - return (1 + ((size - 1) / m_buffer_size)) * m_buffer_size; + return (1 + ((size - 1) / m_base_buffer_size)) * m_base_buffer_size; } ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) { size_t num_bytes_refilled = 0; const auto buffer_end_pos = get_buffer_end_pos(); - size_t num_bytes_alignment = m_buffer_size - (buffer_end_pos % m_buffer_size); + const auto data_size = m_buffer_reader->get_buffer_size(); + + size_t num_bytes_alignment = m_base_buffer_size - (buffer_end_pos % m_base_buffer_size); + if (m_checkpoint_pos.has_value()) { + while (num_bytes_alignment < num_bytes_to_refill) { + num_bytes_alignment += m_base_buffer_size; + } + } + // Don't extend the underlying buffer if enough space is available + if (num_bytes_alignment < m_buffer_size - data_size) { + auto error_code = try_read_into_buffer(m_fd, m_buffer.get() + data_size, + num_bytes_alignment, num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; + } + m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled + data_size); + m_buffer_reader->seek_from_begin(data_size); + return ErrorCode_Success; + } if (false == m_checkpoint_pos.has_value()) { auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), @@ -305,13 +325,9 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) m_buffer_begin_pos = buffer_end_pos; m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); } else { - while (num_bytes_alignment < num_bytes_to_refill) { - num_bytes_alignment += m_buffer_size; - } // Messy way of copying data from old buffer to new buffer - auto data_size = m_buffer_reader->get_buffer_size(); - const auto 
new_buffer_size = data_size + num_bytes_alignment; - auto new_buffer = make_unique(new_buffer_size); + m_buffer_size = data_size + num_bytes_alignment; + auto new_buffer = make_unique(m_buffer_size); memcpy(new_buffer.get(), m_buffer.get(), data_size); // Read data to the new buffer, with offset = data_size @@ -332,9 +348,8 @@ void BufferedFileReader::resize_buffer_from_pos (size_t pos) { const auto copy_size = m_buffer_reader->get_buffer_size() - pos; // Use a quantized size for the underlying buffer size - auto new_buffer_size = quantize_to_buffer_size(copy_size); - - auto new_buffer = make_unique(new_buffer_size); + m_buffer_size = quantize_to_buffer_size(copy_size); + auto new_buffer = make_unique(m_buffer_size); memcpy(new_buffer.get(), &m_buffer[pos], copy_size); m_buffer = std::move(new_buffer); m_buffer_begin_pos += pos; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 0c1e758d8..48fed39fc 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -40,7 +40,7 @@ class BufferedFileReader : public ReaderInterface { // Constructors BufferedFileReader(); - BufferedFileReader(size_t buffer_size); + BufferedFileReader(size_t base_buffer_size); ~BufferedFileReader(); // Methods implementing the ReaderInterface @@ -226,6 +226,7 @@ class BufferedFileReader : public ReaderInterface { size_t m_buffer_begin_pos; // Values for buffer related calculation + size_t m_base_buffer_size; size_t m_buffer_size; // Variables for checkpoint support std::optional m_checkpoint_pos; From 84316d74fc222e78a2ce6bfcd1a171f218e433b7 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 31 Jul 2023 21:58:52 -0400 Subject: [PATCH 057/121] Small fixes --- components/core/src/BufferedFileReader.cpp | 29 +++++++++------------- components/core/src/BufferedFileReader.hpp | 8 +++--- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git 
a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 1bc469c4e..1bfa1ae58 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -235,15 +235,11 @@ ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { } size_t BufferedFileReader::set_checkpoint() { - if (m_checkpoint_pos.has_value()) { - if (m_checkpoint_pos > m_file_pos) { - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } else if (m_checkpoint_pos < m_file_pos) { - if (m_buffer_reader->get_buffer_size() != m_base_buffer_size) { - // allocate new buffer for buffered data starting from pos - resize_buffer_from_pos(m_buffer_reader->get_pos()); - m_buffer_reader->seek_from_begin(get_corresponding_offset(m_file_pos)); - } + if (m_checkpoint_pos.has_value() && m_checkpoint_pos < m_file_pos) { + if (m_buffer_reader->get_buffer_size() != m_base_buffer_size) { + // allocate new buffer for buffered data starting from pos + resize_buffer_from_pos(m_buffer_reader->get_pos()); + m_buffer_reader->seek_from_begin(get_corresponding_offset(m_file_pos)); } } m_checkpoint_pos = m_file_pos; @@ -255,13 +251,6 @@ void BufferedFileReader::clear_checkpoint () { return; } - // TODO: a check to make sure that highest_read_pos should always be in - // the same default_buffer_size range with buffer_end_file_pos - const auto buffer_end_file_pos = get_buffer_end_pos(); - if (buffer_end_file_pos <= highest_read_pos || buffer_end_file_pos - highest_read_pos > m_base_buffer_size) { - throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); - } - m_file_pos = highest_read_pos; resize_buffer_from_pos(get_corresponding_offset(m_file_pos)); m_checkpoint_pos.reset(); @@ -285,7 +274,7 @@ ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const cha return ErrorCode_Success; } -size_t BufferedFileReader::quantize_to_buffer_size (size_t size) { +size_t 
BufferedFileReader::quantize_to_buffer_size (size_t size) const { if (size == 0) { return 0; } @@ -345,6 +334,9 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) } void BufferedFileReader::resize_buffer_from_pos (size_t pos) { + if (pos > m_buffer_reader->get_buffer_size()) { + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } const auto copy_size = m_buffer_reader->get_buffer_size() - pos; // Use a quantized size for the underlying buffer size @@ -358,6 +350,9 @@ void BufferedFileReader::resize_buffer_from_pos (size_t pos) { } size_t BufferedFileReader::get_corresponding_offset (size_t file_pos) const { + if (file_pos < m_buffer_begin_pos) { + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } return file_pos - m_buffer_begin_pos; } diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 48fed39fc..9bbcf497e 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -181,7 +181,7 @@ class BufferedFileReader : public ReaderInterface { * @param size * @return quantized size */ - [[nodiscard]] size_t quantize_to_buffer_size(size_t size); + [[nodiscard]] size_t quantize_to_buffer_size(size_t size) const; /** * Reads next refill_size bytes from file descriptor to the internal buffer @@ -197,14 +197,14 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); /** - * Resize the internal reader buffer by "dropping" all data before pos - * offset in the buffer + * Resize the internal reader buffer and copy over data from the original + * buffer staring from pos to the beginning of the resized the buffer * @param pos */ void resize_buffer_from_pos(size_t pos); /** - * return the file_pos's corresponding offset in the internal buffer + * return the file_pos's corresponding pos in the internal buffer * @param file_pos * @return */ From 
12ee6872adbc3bed745485e10bf226a6fa3424bb Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 7 Aug 2023 22:48:14 -0400 Subject: [PATCH 058/121] First round of fixes --- components/core/CMakeLists.txt | 4 +- components/core/src/BufferReader.cpp | 19 ++-- components/core/src/BufferReader.hpp | 4 +- components/core/src/BufferedFileReader.cpp | 123 ++++++++------------- components/core/src/BufferedFileReader.hpp | 42 +++---- components/core/src/clp/FileCompressor.cpp | 3 +- 6 files changed, 76 insertions(+), 119 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 262ce172d..c38aee0af 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -632,10 +632,10 @@ target_compile_features(clo ) set(SOURCE_FILES_unitTest - src/BufferReader.cpp - src/BufferReader.hpp src/BufferedFileReader.cpp src/BufferedFileReader.hpp + src/BufferReader.cpp + src/BufferReader.hpp src/clp/CommandLineArguments.cpp src/clp/CommandLineArguments.hpp src/clp/compression.cpp diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 58afb7ac2..460d110e0 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -3,7 +3,6 @@ // C++ standard libraries #include -#include #include BufferReader::BufferReader (const char* data, size_t data_size) { @@ -16,7 +15,7 @@ BufferReader::BufferReader (const char* data, size_t data_size) { ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { if (nullptr == buf && num_bytes_to_read > 0) { - return ErrorCode_BadParam; + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } auto remaining_data_size = m_internal_buf_size - m_internal_buf_pos; @@ -45,18 +44,18 @@ ErrorCode BufferReader::try_get_pos (size_t& pos) { return ErrorCode_Success; } -void BufferReader::peek_buffer (size_t size_to_peek, const char*& data_ptr, size_t& 
peek_size) { - peek_size = std::min(size_to_peek, m_internal_buf_size - m_internal_buf_pos); - data_ptr = m_internal_buf + m_internal_buf_pos; +void BufferReader::peek_buffer (const char*& buf, size_t& peek_size) { + peek_size = m_internal_buf_size - m_internal_buf_pos; + buf = m_internal_buf + m_internal_buf_pos; } ErrorCode BufferReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, - std::string& str, size_t& length) { + std::string& str, size_t& num_bytes_read) { if (false == append) { str.clear(); } - // find the pointer pointing to the delimiter + // Find the delimiter const char* buffer_head = m_internal_buf + m_internal_buf_pos; const char* delim_ptr = reinterpret_cast( memchr(buffer_head, delim, m_internal_buf_size - m_internal_buf_pos) @@ -71,11 +70,11 @@ ErrorCode BufferReader::try_read_to_delimiter (char delim, bool keep_delimiter, ret_code = ErrorCode_EndOfFile; } // append to strings - length = delim_pos - m_internal_buf_pos; + num_bytes_read = delim_pos - m_internal_buf_pos; if (false == keep_delimiter && delim == m_internal_buf[delim_pos - 1]) { - --length; + --num_bytes_read; } - str.append(buffer_head, length); + str.append(buffer_head, num_bytes_read); m_internal_buf_pos = delim_pos; return ret_code; } \ No newline at end of file diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 66004b0f3..1fd821a62 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -53,10 +53,10 @@ class BufferReader : public ReaderInterface { // Helper functions [[nodiscard]] size_t get_buffer_size() const { return m_internal_buf_size; } - void peek_buffer (size_t size_to_peek, const char*& data_ptr, size_t& peek_size); + void peek_buffer (const char*& buf, size_t& peek_size); ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, - bool append, std::string& str, size_t& length); + bool append, std::string& str, size_t& num_bytes_read); private: const 
char* m_internal_buf; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 1bfa1ae58..fe1dd32eb 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -18,15 +18,33 @@ using std::make_unique; using std::move; using std::string; -static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, - size_t& num_bytes_read); - -BufferedFileReader::BufferedFileReader () : BufferedFileReader(cDefaultBufferSize) {} +namespace { + ErrorCode try_read_into_buffer(int fd, char* buf, size_t num_bytes_to_read, + size_t& num_bytes_read) { + num_bytes_read = 0; + while (true) { + const auto bytes_read = ::read(fd, buf, num_bytes_to_read); + if (bytes_read > 0) { + buf += bytes_read; + num_bytes_read += bytes_read; + num_bytes_to_read -= bytes_read; + if (num_bytes_read == num_bytes_to_read) { + return ErrorCode_Success; + } + } else if (0 == bytes_read) { + break; + } else { + return ErrorCode_errno; + } + } + if (num_bytes_read == 0) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; + } +} BufferedFileReader::BufferedFileReader (size_t base_buffer_size) { - m_file_pos = 0; - m_fd = -1; - m_checkpoint_pos.reset(); if (base_buffer_size % 4096 != 0) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } @@ -48,7 +66,7 @@ ErrorCode BufferedFileReader::try_get_pos (size_t& pos) { } ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { - if (m_fd == -1) { + if (-1 == m_fd) { return ErrorCode_NotInit; } if (pos == m_file_pos) { @@ -60,14 +78,14 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Failure; } if (ErrorCode_Success == - m_buffer_reader->try_seek_from_begin(get_corresponding_offset(pos))){ + m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos))){ m_file_pos = pos; - highest_read_pos = std::max(highest_read_pos, m_file_pos); + m_highest_read_pos = 
std::max(m_highest_read_pos, m_file_pos); return ErrorCode_Success; } // if checkpoint is not set, simply move the file_pos and invalidate the buffer reader - auto offset = lseek(m_fd, pos, SEEK_SET); - if (offset == -1) { + auto offset = lseek(m_fd, static_cast<__off64_t>(pos), SEEK_SET); + if (-1 == offset) { return ErrorCode_errno; } m_buffer_reader.emplace(m_buffer.get(), 0); @@ -75,13 +93,11 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { } else { if (pos < m_checkpoint_pos) { return ErrorCode_Failure; - } else if (pos < m_file_pos) { - m_buffer_reader->seek_from_begin(get_corresponding_offset(pos)); } if (ErrorCode_Success == - m_buffer_reader->try_seek_from_begin(get_corresponding_offset(pos))) { + m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos))) { m_file_pos = pos; - highest_read_pos = std::max(highest_read_pos, m_file_pos); + m_highest_read_pos = std::max(m_highest_read_pos, m_file_pos); return ErrorCode_Success; } @@ -93,12 +109,12 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { if (ErrorCode_Success != error_code) { return error_code; } - if (ErrorCode_Success != m_buffer_reader->try_seek_from_begin(get_corresponding_offset(pos))) { + if (ErrorCode_Success != m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos))) { throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); } } m_file_pos = pos; - highest_read_pos = std::max(highest_read_pos, m_file_pos); + m_highest_read_pos = std::max(m_highest_read_pos, m_file_pos); return ErrorCode_Success; } @@ -130,7 +146,6 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, return error_code; } - // refill the buffer if more bytes are to be read error_code = refill_reader_buffer(m_base_buffer_size); if (ErrorCode_EndOfFile == error_code) { break; @@ -138,10 +153,10 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, return error_code; } } - if (num_bytes_read == 0) { + if (0 == 
num_bytes_read) { return ErrorCode_EndOfFile; } - highest_read_pos = std::max(highest_read_pos, m_file_pos); + m_highest_read_pos = std::max(m_highest_read_pos, m_file_pos); return ErrorCode_Success; } @@ -173,7 +188,7 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim } } } - highest_read_pos = std::max(highest_read_pos, m_file_pos); + m_highest_read_pos = std::max(m_highest_read_pos, m_file_pos); return ErrorCode_Success; } @@ -192,6 +207,7 @@ ErrorCode BufferedFileReader::try_open (const string& path) { m_file_pos = 0; m_buffer_begin_pos = 0; m_buffer_reader.emplace(m_buffer.get(), 0); + m_highest_read_pos = 0; return ErrorCode_Success; } @@ -222,24 +238,12 @@ void BufferedFileReader::close () { } } -ErrorCode BufferedFileReader::try_fstat (struct stat& stat_buffer) const { - if (-1 == m_fd) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); - } - - auto return_value = fstat(m_fd, &stat_buffer); - if (0 != return_value) { - return ErrorCode_errno; - } - return ErrorCode_Success; -} - size_t BufferedFileReader::set_checkpoint() { if (m_checkpoint_pos.has_value() && m_checkpoint_pos < m_file_pos) { if (m_buffer_reader->get_buffer_size() != m_base_buffer_size) { // allocate new buffer for buffered data starting from pos resize_buffer_from_pos(m_buffer_reader->get_pos()); - m_buffer_reader->seek_from_begin(get_corresponding_offset(m_file_pos)); + m_buffer_reader->seek_from_begin(get_buffer_relative_pos(m_file_pos)); } } m_checkpoint_pos = m_file_pos; @@ -251,26 +255,26 @@ void BufferedFileReader::clear_checkpoint () { return; } - m_file_pos = highest_read_pos; - resize_buffer_from_pos(get_corresponding_offset(m_file_pos)); + m_file_pos = m_highest_read_pos; + resize_buffer_from_pos(get_buffer_relative_pos(m_file_pos)); m_checkpoint_pos.reset(); } -ErrorCode BufferedFileReader::peek_buffered_data (size_t size_to_peek, const char*& data_ptr, +ErrorCode BufferedFileReader::peek_buffered_data (const char*& buf, size_t& 
peek_size) { if (-1 == m_fd) { return ErrorCode_NotInit; } // Refill the buffer if it is not loaded yet - if (false == m_buffer_reader.has_value()) { + if (0 == m_buffer_reader->get_buffer_size()) { auto error_code = refill_reader_buffer(m_base_buffer_size); if (ErrorCode_Success != error_code) { - data_ptr = nullptr; + buf = nullptr; peek_size = 0; return error_code; } } - m_buffer_reader->peek_buffer(size_to_peek, data_ptr, peek_size); + m_buffer_reader->peek_buffer(buf, peek_size); return ErrorCode_Success; } @@ -347,39 +351,4 @@ void BufferedFileReader::resize_buffer_from_pos (size_t pos) { m_buffer_begin_pos += pos; m_buffer_reader.emplace(m_buffer.get(), copy_size); -} - -size_t BufferedFileReader::get_corresponding_offset (size_t file_pos) const { - if (file_pos < m_buffer_begin_pos) { - throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); - } - return file_pos - m_buffer_begin_pos; -} - -size_t BufferedFileReader::get_buffer_end_pos () const { - return m_buffer_begin_pos + m_buffer_reader->get_buffer_size(); -} - -static ErrorCode try_read_into_buffer(int fd, char* buffer, size_t num_bytes_to_read, - size_t& num_bytes_read) { - num_bytes_read = 0; - // keep reading from the fd until enough bytes are read - while (true) { - auto remaining_bytes_to_read = num_bytes_to_read - num_bytes_read; - auto bytes_read = ::read(fd, buffer + num_bytes_read, remaining_bytes_to_read); - if (bytes_read == -1) { - return ErrorCode_errno; - } - if (bytes_read == 0) { - break; - } - num_bytes_read += bytes_read; - if (num_bytes_read == num_bytes_to_read) { - return ErrorCode_Success; - } - } - if (num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } - return ErrorCode_Success; -} +} \ No newline at end of file diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 9bbcf497e..2ebe58da4 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -1,5 +1,5 @@ 
-#ifndef BufferedFileReader_HPP -#define BufferedFileReader_HPP +#ifndef BUFFEREDFILEREADER_HPP +#define BUFFEREDFILEREADER_HPP // C standard libraries @@ -39,8 +39,8 @@ class BufferedFileReader : public ReaderInterface { }; // Constructors - BufferedFileReader(); BufferedFileReader(size_t base_buffer_size); + BufferedFileReader() : BufferedFileReader(cDefaultBufferSize) {} ~BufferedFileReader(); // Methods implementing the ReaderInterface @@ -116,29 +116,19 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] const std::string& get_path () const { return m_path; } /** - * Tries to stat the current file - * @param stat_buffer - * @return ErrorCode_errno on error - * @return ErrorCode_Success on success - */ - [[nodiscard]] ErrorCode try_fstat (struct stat& stat_buffer) const; - - /** - * Peeks the next peek_size bytes of data without advancing the file + * Peeks the buffer without advancing the file * pos. * Note: If further operation such as read or peek is called on the - * BufferedFileReader after peek_buffered_data, the data_ptr could + * BufferedFileReader after peek_buffered_data, the buf could * point to invalid data - * @param size_to_peek - * @param data_ptr pointer pointing to peeked data + * @param buf pointer pointing to peeked data * @param peek_size returns number of bytes peeked by reference * @return ErrorCode_Success on success * @return ErrorCode_errno on error * @return ErrorCode_NotInit if the file is not opened * @return ErrorCode_EndOfFile if already reaching the eof */ - [[nodiscard]] ErrorCode peek_buffered_data(size_t size_to_peek, const char*& data_ptr, - size_t& peek_size); + [[nodiscard]] ErrorCode peek_buffered_data(const char*& data_ptr, size_t& peek_size); /** * Sets a checkpoint at the current file pos. 
@@ -208,31 +198,31 @@ class BufferedFileReader : public ReaderInterface { * @param file_pos * @return */ - [[nodiscard]] size_t get_corresponding_offset(size_t file_pos) const; + [[nodiscard]] size_t get_buffer_relative_pos(size_t file_pos) const { return file_pos - m_buffer_begin_pos; } - [[nodiscard]] size_t get_buffer_end_pos() const; + [[nodiscard]] size_t get_buffer_end_pos() const { return m_buffer_begin_pos + m_buffer_reader->get_buffer_size(); } // Constants - static constexpr size_t cDefaultBufferSize = 65536; + static constexpr size_t cMinBufferSize = (1ULL << 12); + static constexpr size_t cDefaultBufferSize = (16 * cMinBufferSize); // Variables - int m_fd; + int m_fd{-1}; std::string m_path; - size_t m_file_pos; + size_t m_file_pos{0}; // Buffer specific data std::unique_ptr m_buffer; std::optional m_buffer_reader; - size_t m_buffer_begin_pos; + size_t m_buffer_begin_pos{0}; // Values for buffer related calculation size_t m_base_buffer_size; size_t m_buffer_size; // Variables for checkpoint support std::optional m_checkpoint_pos; - size_t highest_read_pos {0}; + size_t m_highest_read_pos{0}; }; - -#endif // BufferedFileReader_HPP +#endif // BUFFEREDFILEREADER_HPP diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index e81fbc1f7..4e3068d8e 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -90,8 +90,7 @@ namespace clp { m_file_reader.open(file_to_compress.get_path()); // Check that file is UTF-8 encoded - if (auto error_code = m_file_reader.peek_buffered_data(cUtf8ValidationBufCapacity, - m_utf8_validation_buf, + if (auto error_code = m_file_reader.peek_buffered_data(m_utf8_validation_buf, m_utf8_validation_buf_length); ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) { SPDLOG_ERROR("Failed to peek data from {}, errno={}", From 787735004c3c7d32dd98da25450346df08060f45 Mon Sep 17 00:00:00 2001 From: haiqi96 
<14502009+haiqi96@users.noreply.github.com> Date: Mon, 7 Aug 2023 23:45:15 -0400 Subject: [PATCH 059/121] temporary test --- .../core/tests/test-BufferedFileReader.cpp | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 1b399157d..74f0e172e 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -205,3 +205,37 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { delete[] test_data; delete[] read_buffer; } + +#include "../src/FileReader.hpp" +TEST_CASE("Test delimiter", "[BufferedFileReader]") { + + // Initialize data for testing + size_t test_data_size = 1L * 1024 * 1024; // 1MB + char* test_data = new char[test_data_size]; + for (size_t i = 0; i < test_data_size; ++i) { + test_data[i] = (char)('a' + (i % 26)); + } + + std::string test_file_path {"BufferedFileReader.delimiter.test"}; + // write to test file + FileWriter file_writer; + file_writer.open(test_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); + file_writer.write(test_data, test_data_size); + file_writer.close(); + + BufferedFileReader buffered_file_reader; + buffered_file_reader.open(test_file_path); + std::string test_string; + + FileReader file_reader; + file_reader.open(test_file_path); + std::string ref_string; + + ErrorCode error_code = ErrorCode_Success; + while(ErrorCode_EndOfFile != error_code) { + error_code = file_reader.try_read_to_delimiter('n', true, false, ref_string); + auto error_code2 = buffered_file_reader.try_read_to_delimiter('n', true, false, test_string); + REQUIRE(error_code2 == error_code); + REQUIRE(test_string == ref_string); + } +} \ No newline at end of file From ca7f48ba6a9f278dbb65a7fc60d819069e7d88b2 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 7 Aug 2023 23:59:59 -0400 Subject: [PATCH 060/121] Fix and refactor for 
read_to_delim --- components/core/src/BufferReader.cpp | 50 +++++++++++++++------- components/core/src/BufferReader.hpp | 8 +++- components/core/src/BufferedFileReader.cpp | 9 ++-- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 460d110e0..a0bd42d8e 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -2,8 +2,11 @@ // C++ standard libraries #include +#include -#include +namespace { + +} BufferReader::BufferReader (const char* data, size_t data_size) { if (nullptr == data) { @@ -18,7 +21,7 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - auto remaining_data_size = m_internal_buf_size - m_internal_buf_pos; + auto remaining_data_size = get_remaining_data_size(); if (0 == remaining_data_size) { return ErrorCode_EndOfFile; } @@ -44,37 +47,52 @@ ErrorCode BufferReader::try_get_pos (size_t& pos) { return ErrorCode_Success; } +ErrorCode BufferReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, + std::string& str) { + bool found_delim; + size_t num_bytes_read; + if (false == append) { + str.clear(); + } + return try_read_to_delimiter(delim, keep_delimiter, str, found_delim, num_bytes_read); +} + void BufferReader::peek_buffer (const char*& buf, size_t& peek_size) { - peek_size = m_internal_buf_size - m_internal_buf_pos; + peek_size = get_remaining_data_size(); buf = m_internal_buf + m_internal_buf_pos; } -ErrorCode BufferReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, - std::string& str, size_t& num_bytes_read) { +ErrorCode BufferReader::try_read_to_delimiter (char delim, + bool keep_delimiter, + std::string& str, + bool& found_delim, + size_t& num_bytes_read) { - if (false == append) { - str.clear(); + found_delim = false; + const auto remaining_data_size = 
get_remaining_data_size(); + if (0 == remaining_data_size) { + return ErrorCode_EndOfFile; } // Find the delimiter const char* buffer_head = m_internal_buf + m_internal_buf_pos; const char* delim_ptr = reinterpret_cast( - memchr(buffer_head, delim, m_internal_buf_size - m_internal_buf_pos) + memchr(buffer_head, delim, remaining_data_size) ); - ErrorCode ret_code; + size_t delim_pos; if (delim_ptr != nullptr) { delim_pos = (delim_ptr - m_internal_buf) + 1; - ret_code = ErrorCode_Success; + num_bytes_read = delim_pos - m_internal_buf_pos; + if (false == keep_delimiter && delim == m_internal_buf[delim_pos - 1]) { + --num_bytes_read; + } + found_delim = true; } else { delim_pos = m_internal_buf_size; - ret_code = ErrorCode_EndOfFile; + num_bytes_read = remaining_data_size; } // append to strings - num_bytes_read = delim_pos - m_internal_buf_pos; - if (false == keep_delimiter && delim == m_internal_buf[delim_pos - 1]) { - --num_bytes_read; - } str.append(buffer_head, num_bytes_read); m_internal_buf_pos = delim_pos; - return ret_code; + return ErrorCode_Success; } \ No newline at end of file diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 1fd821a62..81f245201 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -50,15 +50,19 @@ class BufferReader : public ReaderInterface { */ [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; + [[nodiscard]] ErrorCode try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string &str) override; + // Helper functions [[nodiscard]] size_t get_buffer_size() const { return m_internal_buf_size; } void peek_buffer (const char*& buf, size_t& peek_size); - ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, - bool append, std::string& str, size_t& num_bytes_read); + ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, std::string& str, bool& found_delim, size_t& num_bytes_read); private: + // Method + 
[[nodiscard]] size_t get_remaining_data_size() const { return m_internal_buf_size - m_internal_buf_pos; } + // Variables const char* m_internal_buf; size_t m_internal_buf_size; size_t m_internal_buf_pos{0}; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index fe1dd32eb..4df81d371 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -165,13 +165,16 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim if (-1 == m_fd) { return ErrorCode_NotInit; } - + if (false == append) { + str.clear(); + } bool found_delim {false}; size_t total_append_length {0}; while (false == found_delim) { size_t length {0}; - if (ErrorCode_Success == m_buffer_reader->try_read_to_delimiter(delim, keep_delimiter, append, str, length)) { - found_delim = true; + if (auto ret_code = m_buffer_reader->try_read_to_delimiter(delim, keep_delimiter, str, found_delim, length); + ret_code != ErrorCode_Success && ret_code != ErrorCode_EndOfFile) { + return ret_code; } m_file_pos += length; total_append_length += length; From f47613e1766c2eab262e12591eeb2380b265108c Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 8 Aug 2023 00:33:13 -0400 Subject: [PATCH 061/121] small touch --- components/core/src/BufferReader.cpp | 3 ++- components/core/src/BufferReader.hpp | 5 +++-- components/core/src/BufferedFileReader.cpp | 10 ++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index a0bd42d8e..afbc3969c 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -8,12 +8,13 @@ namespace { } -BufferReader::BufferReader (const char* data, size_t data_size) { +BufferReader::BufferReader (const char* data, size_t data_size, size_t pos) { if (nullptr == data) { throw OperationFailed(ErrorCode_BadParam, 
__FILENAME__, __LINE__); } m_internal_buf = data; m_internal_buf_size = data_size; + m_internal_buf_pos = pos; } ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 81f245201..68aca3b58 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -23,7 +23,8 @@ class BufferReader : public ReaderInterface { }; // Constructors - BufferReader (const char* data, size_t data_size); + BufferReader (const char* data, size_t data_size) : BufferReader(data, data_size, 0) {} + BufferReader (const char* data, size_t data_size, size_t pos); // Methods implementing the ReaderInterface /** @@ -65,7 +66,7 @@ class BufferReader : public ReaderInterface { // Variables const char* m_internal_buf; size_t m_internal_buf_size; - size_t m_internal_buf_pos{0}; + size_t m_internal_buf_pos; }; #endif // BUFFERREADER_HPP diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 4df81d371..10e9df3bb 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -37,7 +37,7 @@ namespace { return ErrorCode_errno; } } - if (num_bytes_read == 0) { + if (0 == num_bytes_read) { return ErrorCode_EndOfFile; } return ErrorCode_Success; @@ -45,7 +45,7 @@ namespace { } BufferedFileReader::BufferedFileReader (size_t base_buffer_size) { - if (base_buffer_size % 4096 != 0) { + if (base_buffer_size % cMinBufferSize != 0) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } m_base_buffer_size = base_buffer_size; @@ -307,8 +307,7 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) if (error_code != ErrorCode_Success) { return error_code; } - m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled + data_size); - m_buffer_reader->seek_from_begin(data_size); + 
m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled + data_size, data_size); return ErrorCode_Success; } @@ -334,8 +333,7 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) } m_buffer = std::move(new_buffer); const auto prev_pos = m_buffer_reader->get_pos(); - m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled); - m_buffer_reader->seek_from_begin(prev_pos); + m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled, prev_pos); } return ErrorCode_Success; } From 85f2fdccab4d2c6a84485377967d5eaea9706cb0 Mon Sep 17 00:00:00 2001 From: Haiqi <14502009+haiqi96@users.noreply.github.com> Date: Tue, 8 Aug 2023 15:02:06 -0400 Subject: [PATCH 062/121] Further refactoring --- components/core/src/LibarchiveFileReader.cpp | 42 ++++++------------- components/core/src/LibarchiveFileReader.hpp | 13 +++--- components/core/src/LibarchiveReader.cpp | 44 ++++---------------- components/core/src/LibarchiveReader.hpp | 21 +++------- components/core/src/clp/FileCompressor.cpp | 5 +-- 5 files changed, 33 insertions(+), 92 deletions(-) diff --git a/components/core/src/LibarchiveFileReader.cpp b/components/core/src/LibarchiveFileReader.cpp index 7158f15d5..96ee23934 100644 --- a/components/core/src/LibarchiveFileReader.cpp +++ b/components/core/src/LibarchiveFileReader.cpp @@ -167,8 +167,8 @@ ErrorCode LibarchiveFileReader::try_read_to_delimiter (char delim, bool keep_del return ErrorCode_Success; } -ErrorCode LibarchiveFileReader::peek_data_block (size_t size_to_peek, const char*& data_ptr, - size_t& peek_size) { +ErrorCode LibarchiveFileReader::try_peek_data_block(const char*&buf, + size_t&buf_size) { if (nullptr == m_archive) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -180,43 +180,27 @@ ErrorCode LibarchiveFileReader::peek_data_block (size_t size_to_peek, const char if (nullptr == m_data_block) { auto error_code = read_next_data_block(); if (ErrorCode_Success != error_code) { - data_ptr 
= nullptr; - peek_size = 0; + buf = nullptr; + buf_size = 0; return error_code; } } if (m_data_block_pos_in_file <= m_pos_in_file) { - // No need to to simulate reading '\0' before the start of the data block - // simply return a const pointer to the current data block - peek_size = std::min(size_to_peek, m_data_block_length - m_pos_in_data_block); - data_ptr = reinterpret_cast(m_data_block); + // Position in the file is within the data block, so we can directly + // return a const pointer to the current data block + buf_size = m_data_block_length - m_pos_in_data_block; + buf = static_cast(m_data_block); return ErrorCode_Success; } - // There are sparse bytes before the data block, so the pos in data block - // must be 0 - assert(m_pos_in_data_block != 0); auto num_sparse_bytes = m_data_block_pos_in_file - m_pos_in_file; - peek_size = std::min(num_sparse_bytes + m_data_block_length, size_to_peek); + buf_size = num_sparse_bytes + m_data_block_length; + m_data_for_peek.resize(buf_size, '\0'); + buf = static_cast(m_data_for_peek.data()); - // resize the local buffer if necessary - if (m_data_for_peek.size() < peek_size) { - m_data_for_peek.resize(peek_size); - } - data_ptr = reinterpret_cast(m_data_for_peek.data()); - - if (size_to_peek < num_sparse_bytes) { - memset(m_data_for_peek.data(), '\0', size_to_peek); - return ErrorCode_Success; - } - - // Size to peek is greater than number of sparse bytes, - // copy over the data from data_block to the peek data buffer - memset(m_data_for_peek.data(), '\0', num_sparse_bytes); - size_t remaining_bytes_to_peek = peek_size - num_sparse_bytes; - const char* data = reinterpret_cast(m_data_block); - memcpy(&m_data_for_peek[num_sparse_bytes], data, remaining_bytes_to_peek); + size_t remaining_bytes_to_peek = buf_size - num_sparse_bytes; + memcpy(&m_data_for_peek[num_sparse_bytes], m_data_block, remaining_bytes_to_peek); return ErrorCode_Success; } diff --git a/components/core/src/LibarchiveFileReader.hpp 
b/components/core/src/LibarchiveFileReader.hpp index 11354fd26..f4544a7a3 100644 --- a/components/core/src/LibarchiveFileReader.hpp +++ b/components/core/src/LibarchiveFileReader.hpp @@ -72,17 +72,16 @@ class LibarchiveFileReader : public ReaderInterface { ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; /** - * Tries to peek up to a given number of bytes from the next data block - * if no enough data is available in the next data block, a smaller peek - * size will be returned - * @param size_to_peek - * @param data_ptr - * @param peek_size Return the number of bytes peeked by reference + * Tries to peek from the next data block and returns the available + * data size + * @param buf + * @param buf_size Returns the number of bytes in the buffer * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Failure on failure * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode peek_data_block(size_t size_to_peek, const char*& data_ptr, size_t& peek_size); + [[nodiscard]] ErrorCode try_peek_data_block(const char*&buf, size_t&buf_size); + // Methods /** * Opens the file reader diff --git a/components/core/src/LibarchiveReader.cpp b/components/core/src/LibarchiveReader.cpp index d993ec7a0..187a5e1bf 100644 --- a/components/core/src/LibarchiveReader.cpp +++ b/components/core/src/LibarchiveReader.cpp @@ -9,7 +9,7 @@ // Project headers #include "Defs.h" -ErrorCode LibarchiveReader::try_open (ReaderInterface& file_reader, const std::string& path_if_compressed_file) { +ErrorCode LibarchiveReader::try_open (ReaderInterface&reader, const std::string& path_if_compressed_file) { // Create and initialize internal libarchive m_archive = archive_read_new(); if (nullptr == m_archive) { @@ -34,12 +34,11 @@ ErrorCode LibarchiveReader::try_open (ReaderInterface& file_reader, const std::s throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - m_file_reader = &file_reader; + m_reader = &reader; 
m_filename_if_compressed = path_if_compressed_file; - return_value = archive_read_open2(m_archive, this, libarchive_open_callback, - libarchive_read_callback, libarchive_skip_callback, - libarchive_close_callback); + return_value = archive_read_open(m_archive, this, libarchive_open_callback, + libarchive_read_callback, libarchive_close_callback); if (ARCHIVE_OK != return_value) { SPDLOG_DEBUG("Failed to open libarchive - {}", archive_error_string(m_archive)); release_resources(); @@ -153,21 +152,6 @@ la_ssize_t LibarchiveReader::libarchive_read_callback (struct archive* archive, return num_bytes_read; } -la_int64_t LibarchiveReader::libarchive_skip_callback (struct archive* archive, void* client_data, off_t request) { - auto& libarchive_reader = *reinterpret_cast(client_data); - - size_t num_bytes_skipped; - auto error_code = libarchive_reader.libarchive_skip_callback(request, num_bytes_skipped); - if (ErrorCode_Success != error_code) { - if (ErrorCode_errno == error_code) { - archive_set_error(archive, errno, "Failed to skip."); - } - return ARCHIVE_FATAL; - } - - return num_bytes_skipped; -} - void LibarchiveReader::libarchive_open_callback () { m_is_opened_by_libarchive = true; } @@ -180,9 +164,10 @@ ErrorCode LibarchiveReader::libarchive_read_callback (const void** buffer, size_ if (false == m_is_opened_by_libarchive) { return ErrorCode_NotInit; } + constexpr size_t cTargetBufferLength = 4096; m_buffer.resize(cTargetBufferLength); - auto error_code = m_file_reader->try_read(m_buffer.data(), cTargetBufferLength, num_bytes_read); + auto error_code = m_reader->try_read(m_buffer.data(), cTargetBufferLength, num_bytes_read); if (ErrorCode_Success != error_code) { return error_code; } @@ -193,21 +178,6 @@ ErrorCode LibarchiveReader::libarchive_read_callback (const void** buffer, size_ return ErrorCode_Success; } -ErrorCode LibarchiveReader::libarchive_skip_callback (off_t num_bytes_to_skip, - size_t& num_bytes_skipped) { - - // skip bytes by simply reading data into 
a temporary buffer - std::vector temporary_read_buffer; - auto error_code = m_file_reader->try_read(temporary_read_buffer.data(), num_bytes_to_skip, - num_bytes_skipped); - if (ErrorCode_EndOfFile == error_code) { - num_bytes_skipped = 0; - } else if (ErrorCode_Success != error_code) { - return error_code; - } - return ErrorCode_Success; -} - void LibarchiveReader::release_resources () { auto return_value = archive_read_free(m_archive); if (ARCHIVE_OK != return_value) { @@ -215,6 +185,6 @@ void LibarchiveReader::release_resources () { } m_archive = nullptr; - m_file_reader = nullptr; + m_reader = nullptr; m_buffer.clear(); } diff --git a/components/core/src/LibarchiveReader.hpp b/components/core/src/LibarchiveReader.hpp index 1e583fca4..77a6a5dff 100644 --- a/components/core/src/LibarchiveReader.hpp +++ b/components/core/src/LibarchiveReader.hpp @@ -36,20 +36,19 @@ class LibarchiveReader { LibarchiveReader () : m_archive(nullptr), m_archive_entry(nullptr), - m_file_reader(nullptr), - m_initial_buffer_content_exhausted(false), + m_reader(nullptr), m_is_opened_by_libarchive(false) {} // Methods /** - * Tries to open the archive or compressed file contained in the FileReader - * @param file_reader + * Tries to open the archive or compressed file contained in the reader + * @param reader * @param path_if_compressed_file Path to use if the data is a single compressed file * @return ErrorCode_Success on success * @return ErrorCode_Failure on failure */ - ErrorCode try_open (ReaderInterface& file_reader, const std::string& path_if_compressed_file); + ErrorCode try_open (ReaderInterface&reader, const std::string& path_if_compressed_file); /** * Closes the reader */ @@ -109,15 +108,6 @@ class LibarchiveReader { * @return -1 on failure */ static la_ssize_t libarchive_read_callback (struct archive* archive, void* client_data, const void** buffer); - /** - * Callback for libarchive->skip - * @param archive - * @param client_data - * @param request - * @return Number of bytes 
skipped on success - * @return ARCHIVE_FATAL on failure - */ - static la_int64_t libarchive_skip_callback (struct archive* archive, void* client_data, off_t request); /** * Marks the archive opened by libarchive @@ -158,8 +148,7 @@ class LibarchiveReader { struct archive_entry* m_archive_entry; std::vector m_buffer; - ReaderInterface* m_file_reader; - bool m_initial_buffer_content_exhausted; + ReaderInterface*m_reader; std::string m_filename_if_compressed; diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index e81fbc1f7..45e6880dc 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -249,9 +249,8 @@ namespace clp { } m_libarchive_reader.open_file_reader(m_libarchive_file_reader); - error_code = m_libarchive_file_reader.peek_data_block(cUtf8ValidationBufCapacity, - m_utf8_validation_buf, - m_utf8_validation_buf_length); + error_code = m_libarchive_file_reader.try_peek_data_block( + m_utf8_validation_buf, m_utf8_validation_buf_length); // Check that file is UTF-8 encoded if (ErrorCode_Success != error_code) { if (ErrorCode_EndOfFile != error_code) { From 749e652a3b8dec15c52e3115c252771b8e160e34 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 8 Aug 2023 17:54:54 -0400 Subject: [PATCH 063/121] small update to the temporary test --- components/core/tests/test-BufferedFileReader.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 74f0e172e..687e45bef 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -212,8 +212,9 @@ TEST_CASE("Test delimiter", "[BufferedFileReader]") { // Initialize data for testing size_t test_data_size = 1L * 1024 * 1024; // 1MB char* test_data = new char[test_data_size]; + std::srand(0); for (size_t 
i = 0; i < test_data_size; ++i) { - test_data[i] = (char)('a' + (i % 26)); + test_data[i] = (char)('a' + (std::rand() % 26)); } std::string test_file_path {"BufferedFileReader.delimiter.test"}; @@ -232,9 +233,10 @@ TEST_CASE("Test delimiter", "[BufferedFileReader]") { std::string ref_string; ErrorCode error_code = ErrorCode_Success; + char delimiter = (char)('a' + (std::rand() % 26)); while(ErrorCode_EndOfFile != error_code) { - error_code = file_reader.try_read_to_delimiter('n', true, false, ref_string); - auto error_code2 = buffered_file_reader.try_read_to_delimiter('n', true, false, test_string); + error_code = file_reader.try_read_to_delimiter(delimiter, true, false, ref_string); + auto error_code2 = buffered_file_reader.try_read_to_delimiter(delimiter, true, false, test_string); REQUIRE(error_code2 == error_code); REQUIRE(test_string == ref_string); } From 25e8dd0da6573dcd7c830cf7f0970e908aa06fd7 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 8 Aug 2023 18:16:58 -0400 Subject: [PATCH 064/121] small clean up --- components/core/src/BufferedFileReader.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 10e9df3bb..a31dce63f 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -4,8 +4,6 @@ #include // C standard libraries -// C libraries -#include #include // C++ standard libraries @@ -331,9 +329,8 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) if (error_code != ErrorCode_Success) { return error_code; } - m_buffer = std::move(new_buffer); - const auto prev_pos = m_buffer_reader->get_pos(); - m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled, prev_pos); + m_buffer = move(new_buffer); + m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled, m_buffer_reader->get_pos()); } 
return ErrorCode_Success; } @@ -348,7 +345,7 @@ void BufferedFileReader::resize_buffer_from_pos (size_t pos) { m_buffer_size = quantize_to_buffer_size(copy_size); auto new_buffer = make_unique(m_buffer_size); memcpy(new_buffer.get(), &m_buffer[pos], copy_size); - m_buffer = std::move(new_buffer); + m_buffer = move(new_buffer); m_buffer_begin_pos += pos; m_buffer_reader.emplace(m_buffer.get(), copy_size); From 5aa45950fb77eb093805407d5def9894c774f6ff Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 8 Aug 2023 21:41:35 -0400 Subject: [PATCH 065/121] Refactored try_seek_from_begin function and made it consistent with decompressor's return code scheme --- components/core/src/BufferReader.cpp | 2 +- components/core/src/BufferedFileReader.cpp | 70 ++++++++++------------ 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index afbc3969c..23db95721 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -37,7 +37,7 @@ ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& n ErrorCode BufferReader::try_seek_from_begin (size_t pos) { if (pos > m_internal_buf_size) { - return ErrorCode_OutOfBounds; + return ErrorCode_Truncated; } m_internal_buf_pos = pos; return ErrorCode_Success; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index a31dce63f..4efd14a4e 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -71,45 +71,41 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } - if (m_checkpoint_pos.has_value() == false) { - if (pos < m_file_pos) { - return ErrorCode_Failure; - } - if (ErrorCode_Success == - m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos))){ - m_file_pos = pos; - m_highest_read_pos = 
std::max(m_highest_read_pos, m_file_pos); - return ErrorCode_Success; - } - // if checkpoint is not set, simply move the file_pos and invalidate the buffer reader - auto offset = lseek(m_fd, static_cast<__off64_t>(pos), SEEK_SET); - if (-1 == offset) { - return ErrorCode_errno; - } - m_buffer_reader.emplace(m_buffer.get(), 0); - m_buffer_begin_pos = pos; - } else { - if (pos < m_checkpoint_pos) { - return ErrorCode_Failure; - } - if (ErrorCode_Success == - m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos))) { - m_file_pos = pos; - m_highest_read_pos = std::max(m_highest_read_pos, m_file_pos); - return ErrorCode_Success; - } + size_t seek_lower_bound = m_file_pos; + if (m_checkpoint_pos.has_value()) { + seek_lower_bound = m_checkpoint_pos.value(); + } - size_t num_bytes_to_refill = pos - get_buffer_end_pos(); - auto error_code = refill_reader_buffer(num_bytes_to_refill); - if (ErrorCode_EndOfFile == error_code) { - throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); - } - if (ErrorCode_Success != error_code) { - return error_code; - } - if (ErrorCode_Success != m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos))) { - throw OperationFailed(ErrorCode_EndOfFile, __FILENAME__, __LINE__); + if (pos < seek_lower_bound) { + return ErrorCode_Failure; + } + + auto error_code = m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos)); + if (ErrorCode_Truncated == error_code) { + if (false == m_checkpoint_pos.has_value()) { + // if checkpoint is not set, simply move the file_pos and invalidate the buffer reader + auto offset = lseek(m_fd, static_cast<__off64_t>(pos), SEEK_SET); + if (-1 == offset) { + return ErrorCode_errno; + } + m_buffer_reader.emplace(m_buffer.get(), 0); + m_buffer_begin_pos = pos; + } else { + size_t num_bytes_to_refill = pos - get_buffer_end_pos(); + error_code = refill_reader_buffer(num_bytes_to_refill); + if (ErrorCode_EndOfFile == error_code) { + return ErrorCode_Truncated; + } + if 
(ErrorCode_Success != error_code) { + return error_code; + } + error_code = m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos)); + if (ErrorCode_Success != error_code){ + return error_code; + } } + } else if (ErrorCode_Success != error_code) { + return error_code; } m_file_pos = pos; m_highest_read_pos = std::max(m_highest_read_pos, m_file_pos); From d74c631bec03e84ac5c6c36f4f45a1333cf7ebe7 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 8 Aug 2023 22:32:48 -0400 Subject: [PATCH 066/121] Refill_buffer_reader refactor --- components/core/src/BufferedFileReader.cpp | 61 ++++++++++------------ 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 4efd14a4e..20c99d13e 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -287,47 +287,40 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) const auto buffer_end_pos = get_buffer_end_pos(); const auto data_size = m_buffer_reader->get_buffer_size(); + const auto available_buffer_space = m_buffer_size - data_size; + size_t buf_internal_pos; - size_t num_bytes_alignment = m_base_buffer_size - (buffer_end_pos % m_base_buffer_size); + size_t bytes_to_read = m_base_buffer_size - (buffer_end_pos % m_base_buffer_size); if (m_checkpoint_pos.has_value()) { - while (num_bytes_alignment < num_bytes_to_refill) { - num_bytes_alignment += m_base_buffer_size; + while (bytes_to_read < num_bytes_to_refill) { + bytes_to_read += m_base_buffer_size; } - } - // Don't extend the underlying buffer if enough space is available - if (num_bytes_alignment < m_buffer_size - data_size) { - auto error_code = try_read_into_buffer(m_fd, m_buffer.get() + data_size, - num_bytes_alignment, num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; + // Grow the buffer if bytes_to_read is 
more + // than available space in the buffer + if (bytes_to_read > available_buffer_space) { + m_buffer_size = data_size + bytes_to_read; + auto new_buffer = make_unique(m_buffer_size); + memcpy(new_buffer.get(), m_buffer.get(), data_size); + m_buffer = std::move(new_buffer); } - m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled + data_size, data_size); - return ErrorCode_Success; - } - - if (false == m_checkpoint_pos.has_value()) { - auto error_code = try_read_into_buffer(m_fd, m_buffer.get(), - num_bytes_alignment, num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; - } - m_buffer_begin_pos = buffer_end_pos; - m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled); + // make Buffer-reader's pos pointing to end of current data + buf_internal_pos = data_size; } else { - // Messy way of copying data from old buffer to new buffer - m_buffer_size = data_size + num_bytes_alignment; - auto new_buffer = make_unique(m_buffer_size); - memcpy(new_buffer.get(), m_buffer.get(), data_size); - - // Read data to the new buffer, with offset = data_size - auto error_code = try_read_into_buffer(m_fd, &new_buffer[data_size], num_bytes_alignment, - num_bytes_refilled); - if (error_code != ErrorCode_Success) { - return error_code; + if (bytes_to_read > available_buffer_space) { + // advance the buffer + buf_internal_pos = 0; + m_buffer_begin_pos = buffer_end_pos; + } else { + buf_internal_pos = data_size; } - m_buffer = move(new_buffer); - m_buffer_reader.emplace(m_buffer.get(), data_size + num_bytes_refilled, m_buffer_reader->get_pos()); } + + auto error_code = try_read_into_buffer(m_fd, m_buffer.get() + buf_internal_pos, + bytes_to_read, num_bytes_refilled); + if (error_code != ErrorCode_Success) { + return error_code; + } + m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled + buf_internal_pos, buf_internal_pos); return ErrorCode_Success; } From 0906e4dafe908efb8912502c9592c4dfbd38e57f Mon Sep 17 00:00:00 2001 From: haiqi96 
<14502009+haiqi96@users.noreply.github.com> Date: Tue, 8 Aug 2023 23:16:12 -0400 Subject: [PATCH 067/121] Fix comments for test --- .../core/tests/test-ir_encoding_methods.cpp | 82 ++++++++++--------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index ad0b1f8be..5ad9a2fe7 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -87,7 +87,7 @@ bool encode_message (epoch_time_ms_t timestamp, string_view message, string& log * Helper function that decodes a message of encoding type = encoded_variable_t * from the ir_buf * @tparam encoded_variable_t Type of the encoded variable - * @param ir_buf + * @param reader * @param message * @param decoded_ts Returns the decoded timestamp * @return IRErrorCode_Success on success, otherwise @@ -97,7 +97,7 @@ bool encode_message (epoch_time_ms_t timestamp, string_view message, string& log * encoded_variable_t == four_byte_encoded_variable_t */ template -IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, epoch_time_ms_t& decoded_ts); +IRErrorCode decode_next_message (BufferReader& reader, string& message, epoch_time_ms_t& decoded_ts); /** * Struct to hold the timestamp info from the IR stream's metadata @@ -184,15 +184,15 @@ bool encode_message (epoch_time_ms_t timestamp, string_view message, string& log } template -IRErrorCode decode_next_message (BufferReader& ir_buf, string& message, epoch_time_ms_t& decoded_ts) { +IRErrorCode decode_next_message (BufferReader& reader, string& message, epoch_time_ms_t& decoded_ts) { static_assert(is_same_v || is_same_v); if constexpr (is_same_v) { - return ffi::ir_stream::eight_byte_encoding::decode_next_message(ir_buf, message, + return ffi::ir_stream::eight_byte_encoding::decode_next_message(reader, message, decoded_ts); } else { - return 
ffi::ir_stream::four_byte_encoding::decode_next_message(ir_buf, message, decoded_ts); + return ffi::ir_stream::four_byte_encoding::decode_next_message(reader, message, decoded_ts); } } @@ -206,16 +206,16 @@ static void set_timestamp_info (const nlohmann::json& metadata_json, TimestampIn TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { bool is_four_bytes_encoding; + // Test eight-byte encoding vector eight_byte_encoding_vec{EightByteEncodingMagicNumber, EightByteEncodingMagicNumber + MagicNumberLength}; - // Test eight-byte encoding - BufferReader eight_byte_encoding_buffer{ + BufferReader eight_byte_ir_buffer{ size_checked_pointer_cast(eight_byte_encoding_vec.data()), eight_byte_encoding_vec.size() }; - REQUIRE(get_encoding_type(eight_byte_encoding_buffer, is_four_bytes_encoding) == + REQUIRE(get_encoding_type(eight_byte_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); @@ -223,20 +223,23 @@ TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { vector four_byte_encoding_vec{FourByteEncodingMagicNumber, FourByteEncodingMagicNumber + MagicNumberLength}; - BufferReader four_byte_encoding_buffer{ + BufferReader four_byte_ir_buffer{ size_checked_pointer_cast(four_byte_encoding_vec.data()), four_byte_encoding_vec.size() }; - REQUIRE(get_encoding_type(four_byte_encoding_buffer, is_four_bytes_encoding) == + REQUIRE(get_encoding_type(four_byte_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); - // Test error on incomplete ir_buffer + // Test error on empty and incomplete ir_buffer + BufferReader empty_ir_buffer(size_checked_pointer_cast(four_byte_encoding_vec.data()), 0); + REQUIRE(get_encoding_type(empty_ir_buffer, is_four_bytes_encoding) == + IRErrorCode::IRErrorCode_Incomplete_IR); + BufferReader incomplete_buffer{ size_checked_pointer_cast(four_byte_encoding_vec.data()), four_byte_encoding_vec.size() 
- 1 }; - REQUIRE(get_encoding_type(incomplete_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Incomplete_IR); @@ -264,37 +267,28 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode const size_t encoded_preamble_end_pos = ir_buf.size(); // Check if encoding type is properly read - BufferReader encoding_buffer{ + BufferReader ir_buffer{ size_checked_pointer_cast(ir_buf.data()), ir_buf.size() }; bool is_four_bytes_encoding; - REQUIRE(get_encoding_type(encoding_buffer, is_four_bytes_encoding) == + REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); - REQUIRE(MagicNumberLength == encoding_buffer.get_pos()); + REQUIRE(MagicNumberLength == ir_buffer.get_pos()); // Test if preamble can be decoded correctly TimestampInfo ts_info; encoded_tag_t metadata_type{0}; size_t metadata_pos{0}; uint16_t metadata_size{0}; - REQUIRE(decode_preamble(encoding_buffer, metadata_type, metadata_pos, metadata_size) == + REQUIRE(decode_preamble(ir_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Success); - REQUIRE(encoded_preamble_end_pos == encoding_buffer.get_pos()); - - auto json_metadata_ptr = reinterpret_cast(ir_buf.data() + metadata_pos); - string_view json_metadata_ref {json_metadata_ptr, metadata_size}; + REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); - // Test if preamble can be decoded by the string copy method - std::vector json_metadata_vec; - encoding_buffer.seek_from_begin(MagicNumberLength); - REQUIRE(decode_preamble(encoding_buffer, metadata_type, json_metadata_vec) == - IRErrorCode::IRErrorCode_Success); - string_view json_metadata_copied { reinterpret_cast(json_metadata_vec.data()), - json_metadata_vec.size() }; - REQUIRE (json_metadata_copied == json_metadata_ref); + char* metadata_ptr{size_checked_pointer_cast(ir_buf.data()) + metadata_pos}; + string_view json_metadata{metadata_ptr, 
metadata_size}; - auto metadata_json = nlohmann::json::parse(json_metadata_ref); + auto metadata_json = nlohmann::json::parse(json_metadata); REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey)); REQUIRE(ffi::ir_stream::cProtocol::Metadata::EncodingJson == metadata_type); @@ -302,7 +296,7 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode REQUIRE(timestamp_pattern_syntax == ts_info.timestamp_pattern_syntax); REQUIRE(time_zone_id == ts_info.time_zone_id); REQUIRE(timestamp_pattern == ts_info.timestamp_pattern); - REQUIRE(encoded_preamble_end_pos == encoding_buffer.get_pos()); + REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); if constexpr (is_same_v) { REQUIRE(reference_ts == @@ -311,6 +305,15 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode .get())); } + // Test if preamble can be decoded by the string copy method + std::vector json_metadata_vec; + ir_buffer.seek_from_begin(MagicNumberLength); + REQUIRE(decode_preamble(ir_buffer, metadata_type, json_metadata_vec) == + IRErrorCode::IRErrorCode_Success); + string_view json_metadata_copied {size_checked_pointer_cast(json_metadata_vec.data()), json_metadata_vec.size()}; + // Crosscheck with the json_metadata decoded previously + REQUIRE (json_metadata_copied == json_metadata); + // Test if incomplete IR can be detected ir_buf.resize(encoded_preamble_end_pos - 1); BufferReader incomplete_preamble_buffer{size_checked_pointer_cast(ir_buf.data()), @@ -343,12 +346,11 @@ TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", const size_t encoded_message_end_pos = ir_buf.size(); const size_t encoded_message_start_pos = 0; - // Test if message can be decoded properly - BufferReader ir_buffer{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; string decoded_message; epoch_time_ms_t timestamp; + // Test if message can be decoded properly 
REQUIRE(IRErrorCode::IRErrorCode_Success == decode_next_message(ir_buffer, decoded_message, timestamp)); REQUIRE(message == decoded_message); @@ -393,7 +395,7 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") epoch_time_ms_t timestamp; // Test if a trailing escape triggers a decoder error - auto ir_with_extra_escape {ir_buf}; + auto ir_with_extra_escape{ir_buf}; ir_with_extra_escape.at(logtype_end_pos - 1) = ffi::cVariablePlaceholderEscapeCharacter; BufferReader ir_with_extra_escape_buffer{ size_checked_pointer_cast(ir_with_extra_escape.data()), @@ -471,11 +473,11 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", reference_messages.push_back(message); reference_timestamps.push_back(ts); - BufferReader complete_encoding_buffer{size_checked_pointer_cast(ir_buf.data()), + BufferReader complete_ir_buffer{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; bool is_four_bytes_encoding; - REQUIRE(get_encoding_type(complete_encoding_buffer, is_four_bytes_encoding) == + REQUIRE(get_encoding_type(complete_ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); @@ -484,11 +486,11 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", encoded_tag_t metadata_type; size_t metadata_pos; uint16_t metadata_size; - REQUIRE(decode_preamble(complete_encoding_buffer, metadata_type, metadata_pos, metadata_size) == + REQUIRE(decode_preamble(complete_ir_buffer, metadata_type, metadata_pos, metadata_size) == IRErrorCode::IRErrorCode_Success); - REQUIRE(encoded_preamble_end_pos == complete_encoding_buffer.get_pos()); + REQUIRE(encoded_preamble_end_pos == complete_ir_buffer.get_pos()); - auto json_metadata_ptr = reinterpret_cast(ir_buf.data() + metadata_pos); + auto* json_metadata_ptr{size_checked_pointer_cast(ir_buf.data() + metadata_pos)}; string_view json_metadata {json_metadata_ptr, metadata_size}; auto metadata_json = 
nlohmann::json::parse(json_metadata); REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == @@ -503,10 +505,10 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", epoch_time_ms_t timestamp; for (size_t ix = 0; ix < reference_messages.size(); ix++) { REQUIRE(IRErrorCode::IRErrorCode_Success == - decode_next_message(complete_encoding_buffer, decoded_message, + decode_next_message(complete_ir_buffer, decoded_message, timestamp)); REQUIRE(decoded_message == reference_messages[ix]); REQUIRE(timestamp == reference_timestamps[ix]); } - REQUIRE(complete_encoding_buffer.get_pos() == ir_buf.size()); + REQUIRE(complete_ir_buffer.get_pos() == ir_buf.size()); } From 504a030644f0900effb9ce869ad14f7e4ecdbe17 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 9 Aug 2023 10:14:57 -0400 Subject: [PATCH 068/121] fixes --- components/core/src/BufferedFileReader.cpp | 27 ++++++++++------------ components/core/src/BufferedFileReader.hpp | 16 +++++++++++++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 20c99d13e..39a9a44af 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -9,11 +9,7 @@ // C++ standard libraries #include -// Project headers -#include - using std::make_unique; -using std::move; using std::string; namespace { @@ -107,8 +103,7 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { } else if (ErrorCode_Success != error_code) { return error_code; } - m_file_pos = pos; - m_highest_read_pos = std::max(m_highest_read_pos, m_file_pos); + update_file_pos(pos); return ErrorCode_Success; } @@ -132,7 +127,7 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, buf += bytes_read; num_bytes_read += bytes_read; num_bytes_to_read -= bytes_read; - m_file_pos += bytes_read; + update_file_pos(m_file_pos + 
bytes_read); if (0 == num_bytes_to_read) { break; } @@ -150,7 +145,6 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, if (0 == num_bytes_read) { return ErrorCode_EndOfFile; } - m_highest_read_pos = std::max(m_highest_read_pos, m_file_pos); return ErrorCode_Success; } @@ -170,7 +164,7 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim ret_code != ErrorCode_Success && ret_code != ErrorCode_EndOfFile) { return ret_code; } - m_file_pos += length; + update_file_pos(m_file_pos + length); total_append_length += length; if (false == found_delim) { @@ -185,7 +179,6 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim } } } - m_highest_read_pos = std::max(m_highest_read_pos, m_file_pos); return ErrorCode_Success; } @@ -212,8 +205,8 @@ void BufferedFileReader::open (const string& path) { ErrorCode error_code = try_open(path); if (ErrorCode_Success != error_code) { if (ErrorCode_FileNotFound == error_code) { - SPDLOG_ERROR("File not found: {}", boost::filesystem::weakly_canonical(path).string()); - throw OperationFailed(error_code, __FILENAME__, __LINE__); + throw OperationFailedWithMsg(error_code, __FILENAME__, __LINE__, + "File not found: " + boost::filesystem::weakly_canonical(path).string()); } else { throw OperationFailed(error_code, __FILENAME__, __LINE__); } @@ -303,11 +296,10 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) memcpy(new_buffer.get(), m_buffer.get(), data_size); m_buffer = std::move(new_buffer); } - // make Buffer-reader's pos pointing to end of current data buf_internal_pos = data_size; } else { if (bytes_to_read > available_buffer_space) { - // advance the buffer + // advance the entire buffer buf_internal_pos = 0; m_buffer_begin_pos = buffer_end_pos; } else { @@ -334,8 +326,13 @@ void BufferedFileReader::resize_buffer_from_pos (size_t pos) { m_buffer_size = quantize_to_buffer_size(copy_size); auto new_buffer = 
make_unique(m_buffer_size); memcpy(new_buffer.get(), &m_buffer[pos], copy_size); - m_buffer = move(new_buffer); + m_buffer = std::move(new_buffer); m_buffer_begin_pos += pos; m_buffer_reader.emplace(m_buffer.get(), copy_size); +} + +void BufferedFileReader::update_file_pos (size_t pos) { + m_file_pos = pos; + m_highest_read_pos = std::max(m_file_pos, m_highest_read_pos); } \ No newline at end of file diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 2ebe58da4..27ff28700 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -38,6 +38,20 @@ class BufferedFileReader : public ReaderInterface { } }; + class OperationFailedWithMsg : public TraceableException { + public: + // Constructors + OperationFailedWithMsg (ErrorCode error_code, const char* const filename, int line_number, std::string message) : + TraceableException (error_code, filename, line_number), m_message(message) {} + + // Methods + [[nodiscard]] const char* what () const noexcept override { + return "BufferedFileReader operation failed"; + } + private: + std::string m_message; + }; + // Constructors BufferedFileReader(size_t base_buffer_size); BufferedFileReader() : BufferedFileReader(cDefaultBufferSize) {} @@ -202,6 +216,8 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] size_t get_buffer_end_pos() const { return m_buffer_begin_pos + m_buffer_reader->get_buffer_size(); } + void update_file_pos(size_t pos); + // Constants static constexpr size_t cMinBufferSize = (1ULL << 12); static constexpr size_t cDefaultBufferSize = (16 * cMinBufferSize); From 66aa40582d75d5d8223e8ca38f0345cc6a3c0024 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 9 Aug 2023 10:21:46 -0400 Subject: [PATCH 069/121] Apply clang-format to merge-conflict files --- .../src/ffi/ir_stream/decoding_methods.cpp | 958 +++++++++--------- 
.../src/ffi/ir_stream/decoding_methods.hpp | 158 +-- .../src/ffi/ir_stream/encoding_methods.hpp | 150 +-- 3 files changed, 666 insertions(+), 600 deletions(-) diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index ccb90ce09..076bd53be 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -1,536 +1,578 @@ #include "decoding_methods.hpp" // Project headers +#include "../../type_utils.hpp" #include "byteswap.hpp" #include "protocol_constants.hpp" -#include "../../type_utils.hpp" using std::is_same_v; using std::string; using std::vector; namespace ffi::ir_stream { - /** - * @tparam encoded_variable_t Type of the encoded variable - * @param tag - * @param is_encoded_var Returns true if tag is for an encoded variable (as - * opposed to a dictionary variable) - * @return Whether the tag is a variable tag - */ - template - static bool is_variable_tag (encoded_tag_t tag, bool& is_encoded_var); - - /** - * Decodes an integer from reader - * @tparam integer_t Type of the integer to decode - * @param reader - * @param value Returns the decoded integer - * @return true on success, false if the reader doesn't contain enough data - * to decode - */ - template - static bool decode_int (ReaderInterface& reader, integer_t& value); - - /** - * Decodes the next logtype string from reader - * @param reader - * @param encoded_tag - * @param logtype Returns the logtype string - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ - static IRErrorCode parse_logtype (ReaderInterface& reader, encoded_tag_t encoded_tag, - string& logtype); - - /** - * Decodes the next dictionary-type variable string from reader - * @param reader - * @param encoded_tag - * @param dict_var Returns the 
dictionary variable - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough - * data to decode - */ - static IRErrorCode parse_dictionary_var (ReaderInterface& reader, encoded_tag_t encoded_tag, - string& dict_var); - - /** - * Parses the next timestamp from reader - * @tparam encoded_variable_t Type of the encoded variable - * @param reader - * @param encoded_tag - * @param ts Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t or the actual - * timestamp if encoded_variable_t == eight_byte_encoded_variable_t - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ - template - IRErrorCode parse_timestamp (ReaderInterface& reader, encoded_tag_t encoded_tag, - epoch_time_ms_t& ts); - - /** - * Decodes the next encoded message from reader - * @tparam encoded_variable_t Type of the encoded variable - * @param reader - * @param message Returns the decoded message - * @param timestamp Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t or the actual - * timestamp if encoded_variable_t == eight_byte_encoded_variable_t - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Decode_Error if the encoded message cannot be - * properly decoded - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ - template - static IRErrorCode generic_decode_next_message (ReaderInterface& reader, - string& message, - epoch_time_ms_t& timestamp); - - /** - * Reads metadata information from the reader - * @param reader - * @param metadata_type Returns the type of the metadata found in the IR - * @param metadata_pos Returns the starting 
position of the metadata in reader - * @param metadata_size Returns the size of the metadata written in the IR - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ - static IRErrorCode read_metadata_info (ReaderInterface& reader, - encoded_tag_t& metadata_type, uint16_t& metadata_size); - - /** - * Decodes the message from the given logtype, encoded variables, and - * dictionary variables. This function properly handles escaped variable - * placeholders in the logtype, as opposed to ffi::decode_message that - * doesn't handle escaped placeholders for simplicity - * @tparam encoded_variable_t Type of the encoded variable - * @param logtype - * @param encoded_vars - * @param dictionary_vars - * @return The decoded message - * @throw EncodingException if the message can't be decoded - */ - template - static string decode_message ( - const string& logtype, - const vector& encoded_vars, - const vector& dictionary_vars - ); - - template - static bool is_variable_tag (encoded_tag_t tag, bool& is_encoded_var) { - static_assert(is_same_v || - is_same_v); - - if (tag == cProtocol::Payload::VarStrLenUByte || - tag == cProtocol::Payload::VarStrLenUShort || - tag == cProtocol::Payload::VarStrLenInt) { - is_encoded_var = false; +/** + * @tparam encoded_variable_t Type of the encoded variable + * @param tag + * @param is_encoded_var Returns true if tag is for an encoded variable (as + * opposed to a dictionary variable) + * @return Whether the tag is a variable tag + */ +template +static bool is_variable_tag(encoded_tag_t tag, bool& is_encoded_var); + +/** + * Decodes an integer from reader + * @tparam integer_t Type of the integer to decode + * @param reader + * @param value Returns the decoded integer + * @return true on success, false if the reader doesn't contain enough data + * to decode + */ +template +static bool 
decode_int(ReaderInterface& reader, integer_t& value); + +/** + * Decodes the next logtype string from reader + * @param reader + * @param encoded_tag + * @param logtype Returns the logtype string + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + * to decode + */ +static IRErrorCode +parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype); + +/** + * Decodes the next dictionary-type variable string from reader + * @param reader + * @param encoded_tag + * @param dict_var Returns the dictionary variable + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough + * data to decode + */ +static IRErrorCode +parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var); + +/** + * Parses the next timestamp from reader + * @tparam encoded_variable_t Type of the encoded variable + * @param reader + * @param encoded_tag + * @param ts Returns the timestamp delta if + * encoded_variable_t == four_byte_encoded_variable_t or the actual + * timestamp if encoded_variable_t == eight_byte_encoded_variable_t + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + * to decode + */ +template +IRErrorCode +parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); + +/** + * Decodes the next encoded message from reader + * @tparam encoded_variable_t Type of the encoded variable + * @param reader + * @param message Returns the decoded message + * @param timestamp Returns the timestamp delta if + * encoded_variable_t == four_byte_encoded_variable_t or the actual + * timestamp if encoded_variable_t == 
eight_byte_encoded_variable_t + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Decode_Error if the encoded message cannot be + * properly decoded + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + * to decode + */ +template +static IRErrorCode +generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp); + +/** + * Reads metadata information from the reader + * @param reader + * @param metadata_type Returns the type of the metadata found in the IR + * @param metadata_pos Returns the starting position of the metadata in reader + * @param metadata_size Returns the size of the metadata written in the IR + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + * to decode + */ +static IRErrorCode +read_metadata_info(ReaderInterface& reader, encoded_tag_t& metadata_type, uint16_t& metadata_size); + +/** + * Decodes the message from the given logtype, encoded variables, and + * dictionary variables. 
This function properly handles escaped variable + * placeholders in the logtype, as opposed to ffi::decode_message that + * doesn't handle escaped placeholders for simplicity + * @tparam encoded_variable_t Type of the encoded variable + * @param logtype + * @param encoded_vars + * @param dictionary_vars + * @return The decoded message + * @throw EncodingException if the message can't be decoded + */ +template +static string decode_message( + string const& logtype, + vector const& encoded_vars, + vector const& dictionary_vars +); + +template +static bool is_variable_tag(encoded_tag_t tag, bool& is_encoded_var) { + static_assert(is_same_v || is_same_v); + + if (tag == cProtocol::Payload::VarStrLenUByte || tag == cProtocol::Payload::VarStrLenUShort + || tag == cProtocol::Payload::VarStrLenInt) + { + is_encoded_var = false; + return true; + } + + if constexpr (is_same_v) { + if (tag == cProtocol::Payload::VarEightByteEncoding) { + is_encoded_var = true; return true; } - - if constexpr (is_same_v) { - if (tag == cProtocol::Payload::VarEightByteEncoding) { - is_encoded_var = true; - return true; - } - } else { - if (tag == cProtocol::Payload::VarFourByteEncoding) { - is_encoded_var = true; - return true; - } + } else { + if (tag == cProtocol::Payload::VarFourByteEncoding) { + is_encoded_var = true; + return true; } + } + return false; +} + +template +static bool decode_int(ReaderInterface& reader, integer_t& value) { + integer_t value_little_endian; + if (reader.try_read_numeric_value(value_little_endian) != ErrorCode_Success) { return false; } - template - static bool decode_int (ReaderInterface& reader, integer_t& value) { - integer_t value_little_endian; - if (reader.try_read_numeric_value(value_little_endian) != ErrorCode_Success) { - return false; - } + constexpr auto read_size = sizeof(integer_t); + static_assert(read_size == 1 || read_size == 2 || read_size == 4 || read_size == 8); + if constexpr (read_size == 1) { + value = value_little_endian; + } else if 
constexpr (read_size == 2) { + value = bswap_16(value_little_endian); + } else if constexpr (read_size == 4) { + value = bswap_32(value_little_endian); + } else if constexpr (read_size == 8) { + value = bswap_64(value_little_endian); + } + return true; +} - constexpr auto read_size = sizeof(integer_t); - static_assert(read_size == 1 || read_size == 2 || read_size == 4 || read_size == 8); - if constexpr (read_size == 1) { - value = value_little_endian; - } else if constexpr (read_size == 2) { - value = bswap_16(value_little_endian); - } else if constexpr (read_size == 4) { - value = bswap_32(value_little_endian); - } else if constexpr (read_size == 8) { - value = bswap_64(value_little_endian); +static IRErrorCode +parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype) { + size_t logtype_length; + if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { + uint8_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; } - return true; + logtype_length = length; + } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenUShort) { + uint16_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + logtype_length = length; + } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenInt) { + int32_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + logtype_length = length; + } else { + return IRErrorCode_Corrupted_IR; } - static IRErrorCode parse_logtype (ReaderInterface& reader, encoded_tag_t encoded_tag, - string& logtype) - { - size_t logtype_length; - if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { - uint8_t length; - if (false == decode_int(reader, length)) { - return IRErrorCode_Incomplete_IR; - } - logtype_length = length; - } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenUShort) { - uint16_t length; - if (false == decode_int(reader, length)) { - return IRErrorCode_Incomplete_IR; - } - 
logtype_length = length; - } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenInt) { - int32_t length; - if (false == decode_int(reader, length)) { - return IRErrorCode_Incomplete_IR; - } - logtype_length = length; - } else { - return IRErrorCode_Corrupted_IR; - } + if (ErrorCode_Success != reader.try_read_string(logtype_length, logtype)) { + return IRErrorCode_Incomplete_IR; + } + return IRErrorCode_Success; +} - if (ErrorCode_Success != reader.try_read_string(logtype_length, logtype)) { +static IRErrorCode +parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var) { + // Decode variable's length + size_t var_length; + if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { + uint8_t length; + if (false == decode_int(reader, length)) { return IRErrorCode_Incomplete_IR; } - return IRErrorCode_Success; + var_length = length; + } else if (cProtocol::Payload::VarStrLenUShort == encoded_tag) { + uint16_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + var_length = length; + } else if (cProtocol::Payload::VarStrLenInt == encoded_tag) { + int32_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + var_length = length; + } else { + return IRErrorCode_Corrupted_IR; + } + + // Read the dictionary variable + if (ErrorCode_Success != reader.try_read_string(var_length, dict_var)) { + return IRErrorCode_Incomplete_IR; } - static IRErrorCode parse_dictionary_var (ReaderInterface& reader, - encoded_tag_t encoded_tag, string& dict_var) { - // Decode variable's length - size_t var_length; - if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { - uint8_t length; - if (false == decode_int(reader, length)) { + return IRErrorCode_Success; +} + +template +IRErrorCode +parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { + static_assert(is_same_v || is_same_v); + + if constexpr (is_same_v) { + if 
(cProtocol::Payload::TimestampVal != encoded_tag) { + return IRErrorCode_Corrupted_IR; + } + if (false == decode_int(reader, ts)) { + return IRErrorCode_Incomplete_IR; + } + } else { + if (cProtocol::Payload::TimestampDeltaByte == encoded_tag) { + int8_t ts_delta; + if (false == decode_int(reader, ts_delta)) { return IRErrorCode_Incomplete_IR; } - var_length = length; - } else if (cProtocol::Payload::VarStrLenUShort == encoded_tag) { - uint16_t length; - if (false == decode_int(reader, length)) { + ts = ts_delta; + } else if (cProtocol::Payload::TimestampDeltaShort == encoded_tag) { + int16_t ts_delta; + if (false == decode_int(reader, ts_delta)) { return IRErrorCode_Incomplete_IR; } - var_length = length; - } else if (cProtocol::Payload::VarStrLenInt == encoded_tag) { - int32_t length; - if (false == decode_int(reader, length)) { + ts = ts_delta; + } else if (cProtocol::Payload::TimestampDeltaInt == encoded_tag) { + int32_t ts_delta; + if (false == decode_int(reader, ts_delta)) { return IRErrorCode_Incomplete_IR; } - var_length = length; + ts = ts_delta; } else { return IRErrorCode_Corrupted_IR; } - - // Read the dictionary variable - if (ErrorCode_Success != reader.try_read_string(var_length, dict_var)) { - return IRErrorCode_Incomplete_IR; - } - - return IRErrorCode_Success; } + return IRErrorCode_Success; +} - template - IRErrorCode parse_timestamp (ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) - { - static_assert(is_same_v || - is_same_v); +template +static IRErrorCode +generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) { + encoded_tag_t encoded_tag; + if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { + return IRErrorCode_Incomplete_IR; + } + if (cProtocol::Eof == encoded_tag) { + return IRErrorCode_Eof; + } - if constexpr (is_same_v) { - if (cProtocol::Payload::TimestampVal != encoded_tag) { - return IRErrorCode_Corrupted_IR; - } - if (false == decode_int(reader, 
ts)) { + // Handle variables + vector encoded_vars; + vector dict_vars; + encoded_variable_t encoded_variable; + string var_str; + bool is_encoded_var; + while (is_variable_tag(encoded_tag, is_encoded_var)) { + if (is_encoded_var) { + if (false == decode_int(reader, encoded_variable)) { return IRErrorCode_Incomplete_IR; } + encoded_vars.push_back(encoded_variable); } else { - if (cProtocol::Payload::TimestampDeltaByte == encoded_tag) { - int8_t ts_delta; - if (false == decode_int(reader, ts_delta)) { - return IRErrorCode_Incomplete_IR; - } - ts = ts_delta; - } else if (cProtocol::Payload::TimestampDeltaShort == encoded_tag) { - int16_t ts_delta; - if (false == decode_int(reader, ts_delta)) { - return IRErrorCode_Incomplete_IR; - } - ts = ts_delta; - } else if (cProtocol::Payload::TimestampDeltaInt == encoded_tag) { - int32_t ts_delta; - if (false == decode_int(reader, ts_delta)) { - return IRErrorCode_Incomplete_IR; - } - ts = ts_delta; - } else { - return IRErrorCode_Corrupted_IR; + if (auto error_code = parse_dictionary_var(reader, encoded_tag, var_str); + IRErrorCode_Success != error_code) + { + return error_code; } + dict_vars.emplace_back(var_str); } - return IRErrorCode_Success; - } - - template - static IRErrorCode generic_decode_next_message (ReaderInterface& reader, string& message, - epoch_time_ms_t& timestamp) - { - encoded_tag_t encoded_tag; if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } - if (cProtocol::Eof == encoded_tag) { - return IRErrorCode_Eof; - } + } - // Handle variables - vector encoded_vars; - vector dict_vars; - encoded_variable_t encoded_variable; - string var_str; - bool is_encoded_var; - while (is_variable_tag(encoded_tag, is_encoded_var)) { - if (is_encoded_var) { - if (false == decode_int(reader, encoded_variable)) { - return IRErrorCode_Incomplete_IR; - } - encoded_vars.push_back(encoded_variable); - } else { - if (auto error_code = parse_dictionary_var(reader, encoded_tag, 
var_str); - IRErrorCode_Success != error_code) - { - return error_code; - } - dict_vars.emplace_back(var_str); - } - if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { - return IRErrorCode_Incomplete_IR; - } - } + // Handle logtype + string logtype; + if (auto error_code = parse_logtype(reader, encoded_tag, logtype); + IRErrorCode_Success != error_code) + { + return error_code; + } - // Handle logtype - string logtype; - if (auto error_code = parse_logtype(reader, encoded_tag, logtype); - IRErrorCode_Success != error_code) - { - return error_code; - } + // NOTE: for the eight-byte encoding, the timestamp is the actual + // timestamp; for the four-byte encoding, the timestamp is a timestamp + // delta + if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { + return IRErrorCode_Incomplete_IR; + } + if (auto error_code = parse_timestamp(reader, encoded_tag, timestamp); + IRErrorCode_Success != error_code) + { + return error_code; + } - // NOTE: for the eight-byte encoding, the timestamp is the actual - // timestamp; for the four-byte encoding, the timestamp is a timestamp - // delta - if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { - return IRErrorCode_Incomplete_IR; - } - if (auto error_code = parse_timestamp(reader, encoded_tag, timestamp); - IRErrorCode_Success != error_code) { - return error_code; - } + try { + message = decode_message(logtype, encoded_vars, dict_vars); + } catch (EncodingException const& e) { + return IRErrorCode_Decode_Error; + } + return IRErrorCode_Success; +} - try { - message = decode_message(logtype, encoded_vars, dict_vars); - } catch (const EncodingException& e) { - return IRErrorCode_Decode_Error; - } - return IRErrorCode_Success; +static IRErrorCode +read_metadata_info(ReaderInterface& reader, encoded_tag_t& metadata_type, uint16_t& metadata_size) { + if (ErrorCode_Success != reader.try_read_numeric_value(metadata_type)) { + return IRErrorCode_Incomplete_IR; } - static 
IRErrorCode read_metadata_info (ReaderInterface& reader, encoded_tag_t& metadata_type, - uint16_t& metadata_size) { - if (ErrorCode_Success != reader.try_read_numeric_value(metadata_type)) { - return IRErrorCode_Incomplete_IR; - } + // Read metadata length + encoded_tag_t encoded_tag; + if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { + return IRErrorCode_Incomplete_IR; + } + switch (encoded_tag) { + case cProtocol::Metadata::LengthUByte: + uint8_t ubyte_res; + if (false == decode_int(reader, ubyte_res)) { + return IRErrorCode_Incomplete_IR; + } + metadata_size = ubyte_res; + break; + case cProtocol::Metadata::LengthUShort: + uint16_t ushort_res; + if (false == decode_int(reader, ushort_res)) { + return IRErrorCode_Incomplete_IR; + } + metadata_size = ushort_res; + break; + default: + return IRErrorCode_Corrupted_IR; + } + return IRErrorCode_Success; +} - // Read metadata length - encoded_tag_t encoded_tag; - if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { - return IRErrorCode_Incomplete_IR; - } - switch (encoded_tag) { - case cProtocol::Metadata::LengthUByte: - uint8_t ubyte_res; - if (false == decode_int(reader, ubyte_res)) { - return IRErrorCode_Incomplete_IR; +template +static string decode_message( + string const& logtype, + vector const& encoded_vars, + vector const& dictionary_vars +) { + string message; + size_t encoded_vars_length = encoded_vars.size(); + size_t dict_vars_length = dictionary_vars.size(); + size_t next_static_text_begin_pos = 0; + + size_t dictionary_vars_ix = 0; + size_t encoded_vars_ix = 0; + for (size_t cur_pos = 0; cur_pos < logtype.length(); ++cur_pos) { + auto c = logtype[cur_pos]; + switch (c) { + case enum_to_underlying_type(VariablePlaceholder::Float): { + message.append( + logtype, + next_static_text_begin_pos, + cur_pos - next_static_text_begin_pos + ); + next_static_text_begin_pos = cur_pos + 1; + if (encoded_vars_ix >= encoded_vars_length) { + throw EncodingException( + 
ErrorCode_Corrupt, + __FILENAME__, + __LINE__, + cTooFewEncodedVarsErrorMessage + ); } - metadata_size = ubyte_res; + message.append(decode_float_var(encoded_vars[encoded_vars_ix])); + ++encoded_vars_ix; + break; - case cProtocol::Metadata::LengthUShort: - uint16_t ushort_res; - if (false == decode_int(reader, ushort_res)) { - return IRErrorCode_Incomplete_IR; + } + + case enum_to_underlying_type(VariablePlaceholder::Integer): { + message.append( + logtype, + next_static_text_begin_pos, + cur_pos - next_static_text_begin_pos + ); + next_static_text_begin_pos = cur_pos + 1; + if (encoded_vars_ix >= encoded_vars_length) { + throw EncodingException( + ErrorCode_Corrupt, + __FILENAME__, + __LINE__, + cTooFewEncodedVarsErrorMessage + ); } - metadata_size = ushort_res; + message.append(decode_integer_var(encoded_vars[encoded_vars_ix])); + ++encoded_vars_ix; + break; - default: - return IRErrorCode_Corrupted_IR; - } - return IRErrorCode_Success; - } + } - template - static string decode_message ( - const string& logtype, - const vector& encoded_vars, - const vector& dictionary_vars - ) { - string message; - size_t encoded_vars_length = encoded_vars.size(); - size_t dict_vars_length = dictionary_vars.size(); - size_t next_static_text_begin_pos = 0; - - size_t dictionary_vars_ix = 0; - size_t encoded_vars_ix = 0; - for (size_t cur_pos = 0; cur_pos < logtype.length(); ++cur_pos) { - auto c = logtype[cur_pos]; - switch(c) { - case enum_to_underlying_type(VariablePlaceholder::Float): { - message.append(logtype, next_static_text_begin_pos, - cur_pos - next_static_text_begin_pos); - next_static_text_begin_pos = cur_pos + 1; - if (encoded_vars_ix >= encoded_vars_length) { - throw EncodingException(ErrorCode_Corrupt, __FILENAME__, __LINE__, - cTooFewEncodedVarsErrorMessage); - } - message.append(decode_float_var(encoded_vars[encoded_vars_ix])); - ++encoded_vars_ix; - - break; + case enum_to_underlying_type(VariablePlaceholder::Dictionary): { + message.append( + logtype, + 
next_static_text_begin_pos, + cur_pos - next_static_text_begin_pos + ); + next_static_text_begin_pos = cur_pos + 1; + if (dictionary_vars_ix >= dict_vars_length) { + throw EncodingException( + ErrorCode_Corrupt, + __FILENAME__, + __LINE__, + cTooFewDictionaryVarsErrorMessage + ); } + message.append(dictionary_vars[dictionary_vars_ix]); + ++dictionary_vars_ix; - case enum_to_underlying_type(VariablePlaceholder::Integer): { - message.append(logtype, next_static_text_begin_pos, - cur_pos - next_static_text_begin_pos); - next_static_text_begin_pos = cur_pos + 1; - if (encoded_vars_ix >= encoded_vars_length) { - throw EncodingException(ErrorCode_Corrupt, __FILENAME__, __LINE__, - cTooFewEncodedVarsErrorMessage); - } - message.append(decode_integer_var(encoded_vars[encoded_vars_ix])); - ++encoded_vars_ix; - - break; - } + break; + } - case enum_to_underlying_type(VariablePlaceholder::Dictionary): { - message.append(logtype, next_static_text_begin_pos, - cur_pos - next_static_text_begin_pos); - next_static_text_begin_pos = cur_pos + 1; - if (dictionary_vars_ix >= dict_vars_length) { - throw EncodingException(ErrorCode_Corrupt, __FILENAME__, __LINE__, - cTooFewDictionaryVarsErrorMessage); - } - message.append(dictionary_vars[dictionary_vars_ix]); - ++dictionary_vars_ix; - - break; + case cVariablePlaceholderEscapeCharacter: { + // Ensure the escape character is followed by a + // character that's being escaped + if (cur_pos == logtype.length() - 1) { + throw EncodingException( + ErrorCode_Corrupt, + __FILENAME__, + __LINE__, + cUnexpectedEscapeCharacterMessage + ); } + message.append( + logtype, + next_static_text_begin_pos, + cur_pos - next_static_text_begin_pos + ); + + // Skip the escape character + next_static_text_begin_pos = cur_pos + 1; + // The character after the escape character is static text + // (regardless of whether it is a variable placeholder), so + // increment cur_pos by 1 to ensure we don't process the + // next character in any of the other cases 
(instead it will + // be added to the message). + ++cur_pos; - case cVariablePlaceholderEscapeCharacter: { - // Ensure the escape character is followed by a - // character that's being escaped - if (cur_pos == logtype.length() - 1) { - throw EncodingException(ErrorCode_Corrupt, __FILENAME__, __LINE__, - cUnexpectedEscapeCharacterMessage); - } - message.append(logtype, next_static_text_begin_pos, - cur_pos - next_static_text_begin_pos); - - // Skip the escape character - next_static_text_begin_pos = cur_pos + 1; - // The character after the escape character is static text - // (regardless of whether it is a variable placeholder), so - // increment cur_pos by 1 to ensure we don't process the - // next character in any of the other cases (instead it will - // be added to the message). - ++cur_pos; - - break; - } + break; } } - // Add remainder - if (next_static_text_begin_pos < logtype.length()) { - message.append(logtype, next_static_text_begin_pos, - logtype.length() - next_static_text_begin_pos); - } - - return message; } - - IRErrorCode get_encoding_type (ReaderInterface& reader, bool& is_four_bytes_encoding) { - char buffer[cProtocol::MagicNumberLength]; - auto error_code = reader.try_read_exact_length(buffer, cProtocol::MagicNumberLength); - if (error_code != ErrorCode_Success) { - return IRErrorCode_Incomplete_IR; - } - if (0 == memcmp(buffer, cProtocol::FourByteEncodingMagicNumber, - cProtocol::MagicNumberLength)) { - is_four_bytes_encoding = true; - } else if (0 == memcmp(buffer, cProtocol::EightByteEncodingMagicNumber, - cProtocol::MagicNumberLength)) { - is_four_bytes_encoding = false; - } else { - return IRErrorCode_Corrupted_IR; - } - return IRErrorCode_Success; + // Add remainder + if (next_static_text_begin_pos < logtype.length()) { + message.append( + logtype, + next_static_text_begin_pos, + logtype.length() - next_static_text_begin_pos + ); } - IRErrorCode decode_preamble (ReaderInterface& reader, encoded_tag_t& metadata_type, - size_t& metadata_pos, 
uint16_t& metadata_size) + return message; +} + +IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding) { + char buffer[cProtocol::MagicNumberLength]; + auto error_code = reader.try_read_exact_length(buffer, cProtocol::MagicNumberLength); + if (error_code != ErrorCode_Success) { + return IRErrorCode_Incomplete_IR; + } + if (0 == memcmp(buffer, cProtocol::FourByteEncodingMagicNumber, cProtocol::MagicNumberLength)) { + is_four_bytes_encoding = true; + } else if (0 == memcmp(buffer, cProtocol::EightByteEncodingMagicNumber, cProtocol::MagicNumberLength)) { - if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); - error_code != IRErrorCode_Success) { - return error_code; - } - metadata_pos = reader.get_pos(); - if (ErrorCode_Success != reader.try_seek_from_begin(metadata_pos + metadata_size)) { - return IRErrorCode_Incomplete_IR; - } - return IRErrorCode_Success; + is_four_bytes_encoding = false; + } else { + return IRErrorCode_Corrupted_IR; } + return IRErrorCode_Success; +} - IRErrorCode decode_preamble (ReaderInterface& reader, encoded_tag_t& metadata_type, - std::vector& metadata) +IRErrorCode decode_preamble( + ReaderInterface& reader, + encoded_tag_t& metadata_type, + size_t& metadata_pos, + uint16_t& metadata_size +) { + if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); + error_code != IRErrorCode_Success) { - uint16_t metadata_size; - if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); - error_code != IRErrorCode_Success) { - return error_code; - } - - metadata.resize(metadata_size); - if (ErrorCode_Success != reader.try_read_exact_length( - size_checked_pointer_cast(metadata.data()), metadata_size)) { - return IRErrorCode_Incomplete_IR; - } - return IRErrorCode_Success; + return error_code; } + metadata_pos = reader.get_pos(); + if (ErrorCode_Success != reader.try_seek_from_begin(metadata_pos + metadata_size)) { + return IRErrorCode_Incomplete_IR; + 
} + return IRErrorCode_Success; +} - namespace four_byte_encoding { - IRErrorCode decode_next_message (ReaderInterface& reader, string& message, - epoch_time_ms_t& timestamp_delta) - { - return generic_decode_next_message( - reader, message, timestamp_delta - ); - } +IRErrorCode decode_preamble( + ReaderInterface& reader, + encoded_tag_t& metadata_type, + std::vector& metadata +) { + uint16_t metadata_size; + if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); + error_code != IRErrorCode_Success) + { + return error_code; } - namespace eight_byte_encoding { - IRErrorCode decode_next_message (ReaderInterface& reader, string& message, - epoch_time_ms_t& timestamp) - { - return generic_decode_next_message( - reader, message, timestamp - ); - } + metadata.resize(metadata_size); + if (ErrorCode_Success + != reader.try_read_exact_length( + size_checked_pointer_cast(metadata.data()), + metadata_size + )) + { + return IRErrorCode_Incomplete_IR; } + return IRErrorCode_Success; } + +namespace four_byte_encoding { + IRErrorCode decode_next_message( + ReaderInterface& reader, + string& message, + epoch_time_ms_t& timestamp_delta + ) { + return generic_decode_next_message( + reader, + message, + timestamp_delta + ); + } +} // namespace four_byte_encoding + +namespace eight_byte_encoding { + IRErrorCode + decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) { + return generic_decode_next_message( + reader, + message, + timestamp + ); + } +} // namespace eight_byte_encoding +} // namespace ffi::ir_stream diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 318e91e97..dfa568623 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -10,89 +10,99 @@ #include "../encoding_methods.hpp" namespace ffi::ir_stream { - using encoded_tag_t = int8_t; +using encoded_tag_t = 
int8_t; - typedef enum { - IRErrorCode_Success, - IRErrorCode_Decode_Error, - IRErrorCode_Eof, - IRErrorCode_Corrupted_IR, - IRErrorCode_Incomplete_IR, - } IRErrorCode; +typedef enum { + IRErrorCode_Success, + IRErrorCode_Decode_Error, + IRErrorCode_Eof, + IRErrorCode_Corrupted_IR, + IRErrorCode_Incomplete_IR, +} IRErrorCode; +/** + * Decodes the encoding type for the encoded IR stream + * @param ir_buf + * @param is_four_bytes_encoding Returns the encoding type + * @return ErrorCode_Success on success + * @return ErrorCode_Corrupted_IR if ir_buf contains invalid IR + * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data to + * decode + */ +IRErrorCode get_encoding_type(ReaderInterface& ir_buf, bool& is_four_bytes_encoding); + +/** + * Decodes the preamble for an IR stream. + * @param ir_buf + * @param metadata_type Returns the type of the metadata found in the IR + * @param metadata_pos Returns the starting position of the metadata in ir_buf + * @param metadata_size Returns the size of the metadata written in the IR + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR + * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough + * data to decode + */ +IRErrorCode decode_preamble( + ReaderInterface& ir_buf, + encoded_tag_t& metadata_type, + size_t& metadata_pos, + uint16_t& metadata_size +); + +/** + * Decodes the preamble for an IR stream. 
+ * @param ir_buf + * @param metadata_type Returns the type of the metadata found in the IR + * @param metadata Returns the metadata as a vector by reference + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR + * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough + * data to decode + */ +IRErrorCode decode_preamble( + ReaderInterface& ir_buf, + encoded_tag_t& metadata_type, + std::vector& metadata +); + +namespace eight_byte_encoding { /** - * Decodes the encoding type for the encoded IR stream + * Decodes the next message for the eight-byte encoding IR stream. * @param ir_buf - * @param is_four_bytes_encoding Returns the encoding type + * @param message Returns the decoded message + * @param timestamp Returns the decoded timestamp * @return ErrorCode_Success on success * @return ErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data to - * decode + * @return ErrorCode_Decode_Error if the encoded message cannot be + * properly decoded + * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data + * to decode + * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode get_encoding_type (ReaderInterface& ir_buf, bool& is_four_bytes_encoding); + IRErrorCode + decode_next_message(ReaderInterface& ir_buf, std::string& message, epoch_time_ms_t& timestamp); +} // namespace eight_byte_encoding +namespace four_byte_encoding { /** - * Decodes the preamble for an IR stream. + * Decodes the next message for the four-byte encoding IR stream. 
* @param ir_buf - * @param metadata_type Returns the type of the metadata found in the IR - * @param metadata_pos Returns the starting position of the metadata in ir_buf - * @param metadata_size Returns the size of the metadata written in the IR - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough - * data to decode - */ - IRErrorCode decode_preamble (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, - size_t& metadata_pos, uint16_t& metadata_size); - - /** - * Decodes the preamble for an IR stream. - * @param ir_buf - * @param metadata_type Returns the type of the metadata found in the IR - * @param metadata Returns the metadata as a vector by reference - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough - * data to decode + * @param message Returns the decoded message + * @param timestamp_delta Returns the decoded timestamp delta + * @return ErrorCode_Success on success + * @return ErrorCode_Corrupted_IR if ir_buf contains invalid IR + * @return ErrorCode_Decode_Error if the encoded message cannot be + * properly decoded + * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data + * to decode + * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode decode_preamble (ReaderInterface& ir_buf, encoded_tag_t& metadata_type, - std::vector& metadata); - - namespace eight_byte_encoding { - /** - * Decodes the next message for the eight-byte encoding IR stream. 
- * @param ir_buf - * @param message Returns the decoded message - * @param timestamp Returns the decoded timestamp - * @return ErrorCode_Success on success - * @return ErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return ErrorCode_Decode_Error if the encoded message cannot be - * properly decoded - * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data - * to decode - * @return ErrorCode_End_of_IR if the IR ends - */ - IRErrorCode decode_next_message (ReaderInterface& ir_buf, std::string& message, - epoch_time_ms_t& timestamp); - } - - namespace four_byte_encoding { - /** - * Decodes the next message for the four-byte encoding IR stream. - * @param ir_buf - * @param message Returns the decoded message - * @param timestamp_delta Returns the decoded timestamp delta - * @return ErrorCode_Success on success - * @return ErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return ErrorCode_Decode_Error if the encoded message cannot be - * properly decoded - * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data - * to decode - * @return ErrorCode_End_of_IR if the IR ends - */ - IRErrorCode decode_next_message (ReaderInterface& ir_buf, std::string& message, - epoch_time_ms_t& timestamp_delta); - } -} + IRErrorCode decode_next_message( + ReaderInterface& ir_buf, + std::string& message, + epoch_time_ms_t& timestamp_delta + ); +} // namespace four_byte_encoding +} // namespace ffi::ir_stream -#endif //FFI_IR_STREAM_DECODING_METHODS_HPP +#endif // FFI_IR_STREAM_DECODING_METHODS_HPP diff --git a/components/core/src/ffi/ir_stream/encoding_methods.hpp b/components/core/src/ffi/ir_stream/encoding_methods.hpp index d1052d07b..2d7d26a75 100644 --- a/components/core/src/ffi/ir_stream/encoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/encoding_methods.hpp @@ -9,77 +9,91 @@ #include "../encoding_methods.hpp" namespace ffi::ir_stream { - namespace eight_byte_encoding { - /** - * Encodes the preamble for the eight-byte 
encoding IR stream - * @param timestamp_pattern - * @param timestamp_pattern_syntax - * @param time_zone_id - * @param reader - * @return true on success, false otherwise - */ - bool encode_preamble (std::string_view timestamp_pattern, - std::string_view timestamp_pattern_syntax, - std::string_view time_zone_id, std::vector& reader); +namespace eight_byte_encoding { + /** + * Encodes the preamble for the eight-byte encoding IR stream + * @param timestamp_pattern + * @param timestamp_pattern_syntax + * @param time_zone_id + * @param reader + * @return true on success, false otherwise + */ + bool encode_preamble( + std::string_view timestamp_pattern, + std::string_view timestamp_pattern_syntax, + std::string_view time_zone_id, + std::vector& reader + ); - /** - * Encodes the given message into the eight-byte encoding IR stream - * @param timestamp - * @param message - * @param logtype - * @param reader - * @return true on success, false otherwise - */ - bool encode_message (epoch_time_ms_t timestamp, std::string_view message, - std::string& logtype, std::vector& reader); - } + /** + * Encodes the given message into the eight-byte encoding IR stream + * @param timestamp + * @param message + * @param logtype + * @param reader + * @return true on success, false otherwise + */ + bool encode_message( + epoch_time_ms_t timestamp, + std::string_view message, + std::string& logtype, + std::vector& reader + ); +} // namespace eight_byte_encoding - namespace four_byte_encoding { - /** - * Encodes the preamble for the four-byte encoding IR stream - * @param timestamp_pattern - * @param timestamp_pattern_syntax - * @param time_zone_id - * @param reference_timestamp - * @param reader - * @return true on success, false otherwise - */ - bool encode_preamble (std::string_view timestamp_pattern, - std::string_view timestamp_pattern_syntax, - std::string_view time_zone_id, epoch_time_ms_t reference_timestamp, - std::vector& reader); +namespace four_byte_encoding { + /** + * Encodes 
the preamble for the four-byte encoding IR stream + * @param timestamp_pattern + * @param timestamp_pattern_syntax + * @param time_zone_id + * @param reference_timestamp + * @param reader + * @return true on success, false otherwise + */ + bool encode_preamble( + std::string_view timestamp_pattern, + std::string_view timestamp_pattern_syntax, + std::string_view time_zone_id, + epoch_time_ms_t reference_timestamp, + std::vector& reader + ); - /** - * Encodes the given message into the four-byte encoding IR stream - * @param timestamp_delta - * @param message - * @param logtype - * @param reader - * @return true on success, false otherwise - */ - bool encode_message (epoch_time_ms_t timestamp_delta, std::string_view message, - std::string& logtype, std::vector& reader); + /** + * Encodes the given message into the four-byte encoding IR stream + * @param timestamp_delta + * @param message + * @param logtype + * @param reader + * @return true on success, false otherwise + */ + bool encode_message( + epoch_time_ms_t timestamp_delta, + std::string_view message, + std::string& logtype, + std::vector& reader + ); - /** - * Encodes the given message into the four-byte encoding IR stream - * without encoding timestamp delta - * @param message - * @param logtype - * @param reader - * @return true on success, false otherwise - */ - bool encode_message (std::string_view message, std::string& logtype, - std::vector& reader); + /** + * Encodes the given message into the four-byte encoding IR stream + * without encoding timestamp delta + * @param message + * @param logtype + * @param reader + * @return true on success, false otherwise + */ + bool + encode_message(std::string_view message, std::string& logtype, std::vector& reader); - /** - * Encodes the given timestamp delta into the four-byte encoding IR - * stream - * @param timestamp_delta - * @param reader - * @return true on success, false otherwise - */ - bool encode_timestamp (epoch_time_ms_t timestamp_delta, std::vector& 
reader); - } -} + /** + * Encodes the given timestamp delta into the four-byte encoding IR + * stream + * @param timestamp_delta + * @param reader + * @return true on success, false otherwise + */ + bool encode_timestamp(epoch_time_ms_t timestamp_delta, std::vector& reader); +} // namespace four_byte_encoding +} // namespace ffi::ir_stream -#endif //FFI_IR_STREAM_ENCODING_METHODS_HPP +#endif // FFI_IR_STREAM_ENCODING_METHODS_HPP From 26b13f7f15e4aff36eb1c38948ce15a4148cf80c Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 9 Aug 2023 15:14:56 -0400 Subject: [PATCH 070/121] Apply clang-format to new classes --- components/core/src/BufferReader.cpp | 42 +++--- components/core/src/BufferReader.hpp | 41 +++--- components/core/src/BufferedFileReader.cpp | 125 ++++++++++-------- components/core/src/BufferedFileReader.hpp | 58 ++++---- .../core/tests/test-BufferedFileReader.cpp | 115 ++++++++-------- 5 files changed, 213 insertions(+), 168 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 23db95721..5dd8cf3b7 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -8,7 +8,7 @@ namespace { } -BufferReader::BufferReader (const char* data, size_t data_size, size_t pos) { +BufferReader::BufferReader(char const* data, size_t data_size, size_t pos) { if (nullptr == data) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } @@ -17,7 +17,7 @@ BufferReader::BufferReader (const char* data, size_t data_size, size_t pos) { m_internal_buf_pos = pos; } -ErrorCode BufferReader::try_read (char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { +ErrorCode BufferReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { if (nullptr == buf && num_bytes_to_read > 0) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } @@ -35,7 +35,7 @@ ErrorCode BufferReader::try_read (char* buf, size_t 
num_bytes_to_read, size_t& n return ErrorCode_Success; } -ErrorCode BufferReader::try_seek_from_begin (size_t pos) { +ErrorCode BufferReader::try_seek_from_begin(size_t pos) { if (pos > m_internal_buf_size) { return ErrorCode_Truncated; } @@ -43,13 +43,17 @@ ErrorCode BufferReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } -ErrorCode BufferReader::try_get_pos (size_t& pos) { +ErrorCode BufferReader::try_get_pos(size_t& pos) { pos = m_internal_buf_pos; return ErrorCode_Success; } -ErrorCode BufferReader::try_read_to_delimiter (char delim, bool keep_delimiter, bool append, - std::string& str) { +ErrorCode BufferReader::try_read_to_delimiter( + char delim, + bool keep_delimiter, + bool append, + std::string& str +) { bool found_delim; size_t num_bytes_read; if (false == append) { @@ -58,27 +62,27 @@ ErrorCode BufferReader::try_read_to_delimiter (char delim, bool keep_delimiter, return try_read_to_delimiter(delim, keep_delimiter, str, found_delim, num_bytes_read); } -void BufferReader::peek_buffer (const char*& buf, size_t& peek_size) { +void BufferReader::peek_buffer(char const*& buf, size_t& peek_size) { peek_size = get_remaining_data_size(); buf = m_internal_buf + m_internal_buf_pos; } -ErrorCode BufferReader::try_read_to_delimiter (char delim, - bool keep_delimiter, - std::string& str, - bool& found_delim, - size_t& num_bytes_read) { - +ErrorCode BufferReader::try_read_to_delimiter( + char delim, + bool keep_delimiter, + std::string& str, + bool& found_delim, + size_t& num_bytes_read +) { found_delim = false; - const auto remaining_data_size = get_remaining_data_size(); + auto const remaining_data_size = get_remaining_data_size(); if (0 == remaining_data_size) { return ErrorCode_EndOfFile; } // Find the delimiter - const char* buffer_head = m_internal_buf + m_internal_buf_pos; - const char* delim_ptr = reinterpret_cast( - memchr(buffer_head, delim, remaining_data_size) - ); + char const* buffer_head = m_internal_buf + m_internal_buf_pos; + char 
const* delim_ptr + = reinterpret_cast(memchr(buffer_head, delim, remaining_data_size)); size_t delim_pos; if (delim_ptr != nullptr) { @@ -96,4 +100,4 @@ ErrorCode BufferReader::try_read_to_delimiter (char delim, str.append(buffer_head, num_bytes_read); m_internal_buf_pos = delim_pos; return ErrorCode_Success; -} \ No newline at end of file +} diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 68aca3b58..4e583b3b5 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -13,18 +13,19 @@ class BufferReader : public ReaderInterface { class OperationFailed : public TraceableException { public: // Constructors - OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : - TraceableException (error_code, filename, line_number) {} + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} // Methods - [[nodiscard]] const char* what () const noexcept override { + [[nodiscard]] char const* what() const noexcept override { return "BufferReader operation failed"; } }; // Constructors - BufferReader (const char* data, size_t data_size) : BufferReader(data, data_size, 0) {} - BufferReader (const char* data, size_t data_size, size_t pos); + BufferReader(char const* data, size_t data_size) : BufferReader(data, data_size, 0) {} + + BufferReader(char const* data, size_t data_size, size_t pos); // Methods implementing the ReaderInterface /** @@ -36,37 +37,47 @@ class BufferReader : public ReaderInterface { * @return ErrorCode_EndOfFile if buffer doesn't contain more data * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, - size_t& num_bytes_read) override; + [[nodiscard]] ErrorCode + try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; /** * Tries to seek from the beginning of the buffer to the given 
position * @param pos * @return ErrorCode_OutOfBounds if the given position > the buffer's size * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; + [[nodiscard]] ErrorCode try_seek_from_begin(size_t pos) override; /** * @param pos Returns the position of the read head in the buffer * @return ErrorCode_Success */ - [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; + [[nodiscard]] ErrorCode try_get_pos(size_t& pos) override; - [[nodiscard]] ErrorCode try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string &str) override; + [[nodiscard]] ErrorCode + try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str) override; // Helper functions [[nodiscard]] size_t get_buffer_size() const { return m_internal_buf_size; } - void peek_buffer (const char*& buf, size_t& peek_size); + void peek_buffer(char const*& buf, size_t& peek_size); - ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, std::string& str, bool& found_delim, size_t& num_bytes_read); + ErrorCode try_read_to_delimiter( + char delim, + bool keep_delimiter, + std::string& str, + bool& found_delim, + size_t& num_bytes_read + ); private: // Method - [[nodiscard]] size_t get_remaining_data_size() const { return m_internal_buf_size - m_internal_buf_pos; } + [[nodiscard]] size_t get_remaining_data_size() const { + return m_internal_buf_size - m_internal_buf_pos; + } + // Variables - const char* m_internal_buf; + char const* m_internal_buf; size_t m_internal_buf_size; size_t m_internal_buf_pos; }; -#endif // BUFFERREADER_HPP +#endif // BUFFERREADER_HPP diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 39a9a44af..c9a8d21e4 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -13,32 +13,32 @@ using std::make_unique; using std::string; namespace { - ErrorCode 
try_read_into_buffer(int fd, char* buf, size_t num_bytes_to_read, - size_t& num_bytes_read) { - num_bytes_read = 0; - while (true) { - const auto bytes_read = ::read(fd, buf, num_bytes_to_read); - if (bytes_read > 0) { - buf += bytes_read; - num_bytes_read += bytes_read; - num_bytes_to_read -= bytes_read; - if (num_bytes_read == num_bytes_to_read) { - return ErrorCode_Success; - } - } else if (0 == bytes_read) { - break; - } else { - return ErrorCode_errno; +ErrorCode +try_read_into_buffer(int fd, char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + num_bytes_read = 0; + while (true) { + auto const bytes_read = ::read(fd, buf, num_bytes_to_read); + if (bytes_read > 0) { + buf += bytes_read; + num_bytes_read += bytes_read; + num_bytes_to_read -= bytes_read; + if (num_bytes_read == num_bytes_to_read) { + return ErrorCode_Success; } + } else if (0 == bytes_read) { + break; + } else { + return ErrorCode_errno; } - if (0 == num_bytes_read) { - return ErrorCode_EndOfFile; - } - return ErrorCode_Success; } + if (0 == num_bytes_read) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; } +} // namespace -BufferedFileReader::BufferedFileReader (size_t base_buffer_size) { +BufferedFileReader::BufferedFileReader(size_t base_buffer_size) { if (base_buffer_size % cMinBufferSize != 0) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } @@ -47,11 +47,11 @@ BufferedFileReader::BufferedFileReader (size_t base_buffer_size) { m_buffer = make_unique(m_buffer_size); } -BufferedFileReader::~BufferedFileReader () { +BufferedFileReader::~BufferedFileReader() { close(); } -ErrorCode BufferedFileReader::try_get_pos (size_t& pos) { +ErrorCode BufferedFileReader::try_get_pos(size_t& pos) { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -59,7 +59,7 @@ ErrorCode BufferedFileReader::try_get_pos (size_t& pos) { return ErrorCode_Success; } -ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { +ErrorCode 
BufferedFileReader::try_seek_from_begin(size_t pos) { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -96,7 +96,7 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return error_code; } error_code = m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos)); - if (ErrorCode_Success != error_code){ + if (ErrorCode_Success != error_code) { return error_code; } } @@ -107,8 +107,8 @@ ErrorCode BufferedFileReader::try_seek_from_begin (size_t pos) { return ErrorCode_Success; } -ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, - size_t& num_bytes_read) { +ErrorCode +BufferedFileReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -121,7 +121,7 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, num_bytes_read = 0; while (true) { - size_t bytes_read {0}; + size_t bytes_read{0}; auto error_code = m_buffer_reader->try_read(buf, num_bytes_to_read, bytes_read); if (ErrorCode_Success == error_code) { buf += bytes_read; @@ -148,20 +148,27 @@ ErrorCode BufferedFileReader::try_read (char* buf, size_t num_bytes_to_read, return ErrorCode_Success; } -ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delimiter, - bool append, string& str) { +ErrorCode BufferedFileReader::try_read_to_delimiter( + char delim, + bool keep_delimiter, + bool append, + string& str +) { if (-1 == m_fd) { return ErrorCode_NotInit; } if (false == append) { str.clear(); } - bool found_delim {false}; - size_t total_append_length {0}; + bool found_delim{false}; + size_t total_append_length{0}; while (false == found_delim) { - size_t length {0}; - if (auto ret_code = m_buffer_reader->try_read_to_delimiter(delim, keep_delimiter, str, found_delim, length); - ret_code != ErrorCode_Success && ret_code != ErrorCode_EndOfFile) { + size_t length{0}; + if (auto ret_code + = m_buffer_reader + ->try_read_to_delimiter(delim, keep_delimiter, str, 
found_delim, length); + ret_code != ErrorCode_Success && ret_code != ErrorCode_EndOfFile) + { return ret_code; } update_file_pos(m_file_pos + length); @@ -182,7 +189,7 @@ ErrorCode BufferedFileReader::try_read_to_delimiter (char delim, bool keep_delim return ErrorCode_Success; } -ErrorCode BufferedFileReader::try_open (const string& path) { +ErrorCode BufferedFileReader::try_open(string const& path) { // Cleanup in case caller forgot to call close before calling this function close(); @@ -201,19 +208,23 @@ ErrorCode BufferedFileReader::try_open (const string& path) { return ErrorCode_Success; } -void BufferedFileReader::open (const string& path) { +void BufferedFileReader::open(string const& path) { ErrorCode error_code = try_open(path); if (ErrorCode_Success != error_code) { if (ErrorCode_FileNotFound == error_code) { - throw OperationFailedWithMsg(error_code, __FILENAME__, __LINE__, - "File not found: " + boost::filesystem::weakly_canonical(path).string()); + throw OperationFailedWithMsg( + error_code, + __FILENAME__, + __LINE__, + "File not found: " + boost::filesystem::weakly_canonical(path).string() + ); } else { throw OperationFailed(error_code, __FILENAME__, __LINE__); } } } -void BufferedFileReader::close () { +void BufferedFileReader::close() { if (-1 != m_fd) { // NOTE: We don't check errors for fclose since it seems // the only reason it could fail is if it was interrupted by a signal @@ -240,7 +251,7 @@ size_t BufferedFileReader::set_checkpoint() { return m_file_pos; } -void BufferedFileReader::clear_checkpoint () { +void BufferedFileReader::clear_checkpoint() { if (false == m_checkpoint_pos.has_value()) { return; } @@ -250,8 +261,7 @@ void BufferedFileReader::clear_checkpoint () { m_checkpoint_pos.reset(); } -ErrorCode BufferedFileReader::peek_buffered_data (const char*& buf, - size_t& peek_size) { +ErrorCode BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_size) { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -268,19 +278,19 @@ 
ErrorCode BufferedFileReader::peek_buffered_data (const char*& buf, return ErrorCode_Success; } -size_t BufferedFileReader::quantize_to_buffer_size (size_t size) const { +size_t BufferedFileReader::quantize_to_buffer_size(size_t size) const { if (size == 0) { return 0; } return (1 + ((size - 1) / m_base_buffer_size)) * m_base_buffer_size; } -ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) { +ErrorCode BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) { size_t num_bytes_refilled = 0; - const auto buffer_end_pos = get_buffer_end_pos(); - const auto data_size = m_buffer_reader->get_buffer_size(); - const auto available_buffer_space = m_buffer_size - data_size; + auto const buffer_end_pos = get_buffer_end_pos(); + auto const data_size = m_buffer_reader->get_buffer_size(); + auto const available_buffer_space = m_buffer_size - data_size; size_t buf_internal_pos; size_t bytes_to_read = m_base_buffer_size - (buffer_end_pos % m_base_buffer_size); @@ -307,21 +317,26 @@ ErrorCode BufferedFileReader::refill_reader_buffer (size_t num_bytes_to_refill) } } - auto error_code = try_read_into_buffer(m_fd, m_buffer.get() + buf_internal_pos, - bytes_to_read, num_bytes_refilled); + auto error_code = try_read_into_buffer( + m_fd, + m_buffer.get() + buf_internal_pos, + bytes_to_read, + num_bytes_refilled + ); if (error_code != ErrorCode_Success) { return error_code; } - m_buffer_reader.emplace(m_buffer.get(), num_bytes_refilled + buf_internal_pos, buf_internal_pos); + m_buffer_reader + .emplace(m_buffer.get(), num_bytes_refilled + buf_internal_pos, buf_internal_pos); return ErrorCode_Success; } -void BufferedFileReader::resize_buffer_from_pos (size_t pos) { +void BufferedFileReader::resize_buffer_from_pos(size_t pos) { if (pos > m_buffer_reader->get_buffer_size()) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - const auto copy_size = m_buffer_reader->get_buffer_size() - pos; + auto const copy_size = 
m_buffer_reader->get_buffer_size() - pos; // Use a quantized size for the underlying buffer size m_buffer_size = quantize_to_buffer_size(copy_size); auto new_buffer = make_unique(m_buffer_size); @@ -332,7 +347,7 @@ void BufferedFileReader::resize_buffer_from_pos (size_t pos) { m_buffer_reader.emplace(m_buffer.get(), copy_size); } -void BufferedFileReader::update_file_pos (size_t pos) { +void BufferedFileReader::update_file_pos(size_t pos) { m_file_pos = pos; m_highest_read_pos = std::max(m_file_pos, m_highest_read_pos); -} \ No newline at end of file +} diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 27ff28700..2e22815fd 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -29,11 +29,11 @@ class BufferedFileReader : public ReaderInterface { class OperationFailed : public TraceableException { public: // Constructors - OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : - TraceableException (error_code, filename, line_number) {} + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} // Methods - [[nodiscard]] const char* what () const noexcept override { + [[nodiscard]] char const* what() const noexcept override { return "BufferedFileReader operation failed"; } }; @@ -41,20 +41,29 @@ class BufferedFileReader : public ReaderInterface { class OperationFailedWithMsg : public TraceableException { public: // Constructors - OperationFailedWithMsg (ErrorCode error_code, const char* const filename, int line_number, std::string message) : - TraceableException (error_code, filename, line_number), m_message(message) {} + OperationFailedWithMsg( + ErrorCode error_code, + char const* const filename, + int line_number, + std::string message + ) + : TraceableException(error_code, filename, line_number), + m_message(message) {} // Methods - 
[[nodiscard]] const char* what () const noexcept override { + [[nodiscard]] char const* what() const noexcept override { return "BufferedFileReader operation failed"; } + private: std::string m_message; }; // Constructors BufferedFileReader(size_t base_buffer_size); + BufferedFileReader() : BufferedFileReader(cDefaultBufferSize) {} + ~BufferedFileReader(); // Methods implementing the ReaderInterface @@ -65,7 +74,7 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_get_pos (size_t& pos) override; + [[nodiscard]] ErrorCode try_get_pos(size_t& pos) override; /** * Tries to seek from the beginning of the file to the given position * @param pos @@ -73,7 +82,7 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_seek_from_begin (size_t pos) override; + [[nodiscard]] ErrorCode try_seek_from_begin(size_t pos) override; /** * Tries to read up to a given number of bytes from the file @@ -86,8 +95,8 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_read (char* buf, size_t num_bytes_to_read, - size_t& num_bytes_read) override; + [[nodiscard]] ErrorCode + try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; /** * Tries to read a string from the file until it reaches @@ -102,11 +111,11 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_errno otherwise */ - [[nodiscard]] ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, - bool append, std::string& str) override; + [[nodiscard]] ErrorCode + try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str) override; // Methods - [[nodiscard]] bool is_open () const { return -1 != 
m_fd; } + [[nodiscard]] bool is_open() const { return -1 != m_fd; } /** * Tries to open a file @@ -115,19 +124,19 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_FileNotFound if the file was not found * @return ErrorCode_errno otherwise */ - [[nodiscard]] ErrorCode try_open (const std::string& path); + [[nodiscard]] ErrorCode try_open(std::string const& path); /** * Opens a file * @param path * @throw BufferedFileReader::OperationFailed on failure */ - void open (const std::string& path); + void open(std::string const& path); /** * Closes the file if it's open */ - void close (); + void close(); - [[nodiscard]] const std::string& get_path () const { return m_path; } + [[nodiscard]] std::string const& get_path() const { return m_path; } /** * Peeks the buffer without advancing the file @@ -142,7 +151,7 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_NotInit if the file is not opened * @return ErrorCode_EndOfFile if already reaching the eof */ - [[nodiscard]] ErrorCode peek_buffered_data(const char*& data_ptr, size_t& peek_size); + [[nodiscard]] ErrorCode peek_buffered_data(char const*& data_ptr, size_t& peek_size); /** * Sets a checkpoint at the current file pos. @@ -176,7 +185,7 @@ class BufferedFileReader : public ReaderInterface { * 'm_buffer_size' using the rounding method. 
This ensures that the current * read pos still resides in the resized buffer */ - void clear_checkpoint (); + void clear_checkpoint(); private: // Methods @@ -212,9 +221,13 @@ class BufferedFileReader : public ReaderInterface { * @param file_pos * @return */ - [[nodiscard]] size_t get_buffer_relative_pos(size_t file_pos) const { return file_pos - m_buffer_begin_pos; } + [[nodiscard]] size_t get_buffer_relative_pos(size_t file_pos) const { + return file_pos - m_buffer_begin_pos; + } - [[nodiscard]] size_t get_buffer_end_pos() const { return m_buffer_begin_pos + m_buffer_reader->get_buffer_size(); } + [[nodiscard]] size_t get_buffer_end_pos() const { + return m_buffer_begin_pos + m_buffer_reader->get_buffer_size(); + } void update_file_pos(size_t pos); @@ -238,7 +251,6 @@ class BufferedFileReader : public ReaderInterface { // Variables for checkpoint support std::optional m_checkpoint_pos; size_t m_highest_read_pos{0}; - }; -#endif // BUFFEREDFILEREADER_HPP +#endif // BUFFEREDFILEREADER_HPP diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 687e45bef..287e26c9a 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -8,20 +8,19 @@ #include "../submodules/Catch2/single_include/catch2/catch.hpp" // Project headers -#include "../src/FileWriter.hpp" #include "../src/BufferedFileReader.hpp" +#include "../src/FileWriter.hpp" TEST_CASE("Test reading data", "[BufferedFileReader]") { - // Initialize data for testing - size_t test_data_size = 4L * 1024 * 1024 + 1; // 4MB + 1 + size_t test_data_size = 4L * 1024 * 1024 + 1; // 4MB + 1 char* test_data = new char[test_data_size]; char* read_buffer = new char[test_data_size]; for (size_t i = 0; i < test_data_size; ++i) { test_data[i] = (char)('a' + (i % 26)); } - std::string test_file_path {"BufferedFileReader.test"}; + std::string test_file_path{"BufferedFileReader.test"}; // write to test file 
FileWriter file_writer; file_writer.open(test_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); @@ -31,30 +30,30 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { SECTION("General read testing") { BufferedFileReader file_reader; file_reader.open(test_file_path); - size_t num_bytes_read {0}; - size_t buffer_offset {0}; + size_t num_bytes_read{0}; + size_t buffer_offset{0}; // first, read a small chunk of data; - size_t read_size1 {1023}; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer + buffer_offset, read_size1, - num_bytes_read)); + size_t read_size1{1023}; + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer + buffer_offset, read_size1, num_bytes_read)); REQUIRE(read_size1 == num_bytes_read); REQUIRE(0 == memcmp(read_buffer, test_data, read_size1)); buffer_offset += num_bytes_read; // second, read a large chunk of data, so // BufferedFileReader will refill the internal buffer - size_t read_size2 {65538}; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer + buffer_offset, read_size2, - num_bytes_read)); + size_t read_size2{65'538}; + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer + buffer_offset, read_size2, num_bytes_read)); REQUIRE(read_size2 == num_bytes_read); REQUIRE(0 == memcmp(read_buffer, test_data, read_size1 + read_size2)); buffer_offset += num_bytes_read; // third, read remaining data size_t read_size3 = test_data_size - read_size2 - read_size1; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer + buffer_offset, read_size3, - num_bytes_read)); + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer + buffer_offset, read_size3, num_bytes_read)); REQUIRE(read_size3 == num_bytes_read); buffer_offset += num_bytes_read; @@ -62,8 +61,8 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { // lastly, make sure the file reaches eof size_t read_size4 = 1; - REQUIRE(ErrorCode_EndOfFile == file_reader.try_read(read_buffer + buffer_offset, - read_size4, 
num_bytes_read)); + REQUIRE(ErrorCode_EndOfFile + == file_reader.try_read(read_buffer + buffer_offset, read_size4, num_bytes_read)); file_reader.close(); } @@ -72,20 +71,20 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { file_reader.open(test_file_path); // seek to some random position and do a read - size_t seek_pos1 {245}; - size_t num_bytes_to_read {65540}; + size_t seek_pos1{245}; + size_t num_bytes_to_read{65'540}; size_t num_byte_read; REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos1)); - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, - num_byte_read)); + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); REQUIRE(num_bytes_to_read == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos1, num_bytes_to_read)); // seek front to random position and do a read - size_t seek_pos2 {345212}; + size_t seek_pos2{345'212}; REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos2)); - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, - num_byte_read)); + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); REQUIRE(num_bytes_to_read == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos2, num_bytes_to_read)); @@ -100,40 +99,40 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { size_t num_byte_read; // first, read some data to advance the file_pos - size_t num_bytes_to_read_1 = 65540; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_1, - num_byte_read)); + size_t num_bytes_to_read_1 = 65'540; + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read_1, num_byte_read)); REQUIRE(file_reader.get_pos() == num_bytes_to_read_1); // set a checkpoint size_t checkpoint_pos = file_reader.set_checkpoint(); // keep reading some data - size_t num_bytes_to_read_2 = 345212; - 
REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_2, - num_byte_read)); + size_t num_bytes_to_read_2 = 345'212; + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read_2, num_byte_read)); REQUIRE(file_reader.get_pos() == num_bytes_to_read_1 + num_bytes_to_read_2); size_t latest_file_pos = file_reader.get_pos(); // now seek back to some where between size_t seek_pos_1 = checkpoint_pos + 500; REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos_1)); - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_2, - num_byte_read)); + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read_2, num_byte_read)); REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos_1, num_bytes_to_read_2)); // update the latest_file_pos if necessary latest_file_pos = std::max(latest_file_pos, file_reader.get_pos()); // now try to seek back to a pos that's before the checkpoint - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(checkpoint_pos-1)); + REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(checkpoint_pos - 1)); // now go back to latest data REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(latest_file_pos)); // make sure data read after latest_file_pos size_t num_bytes_to_read_3 = 4096; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_3, - num_byte_read)); + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read_3, num_byte_read)); REQUIRE(num_bytes_to_read_3 == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, num_bytes_to_read_3)); // update the latest_file_pos @@ -147,18 +146,21 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { // make sure data read after checkpoint-set are still correct size_t num_bytes_to_read_4 = 4096; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read_4, - 
num_byte_read)); + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read_4, num_byte_read)); REQUIRE(num_bytes_to_read_4 == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + (latest_file_pos + checkpoint_pos) / 2, - num_bytes_to_read_4)); - + REQUIRE(0 + == memcmp( + read_buffer, + test_data + (latest_file_pos + checkpoint_pos) / 2, + num_bytes_to_read_4 + )); file_reader.clear_checkpoint(); - size_t default_buffer_size = 65536; + size_t default_buffer_size = 65'536; // make sure data read after checkpoint-reset are still correct; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, default_buffer_size, - num_byte_read)); + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, default_buffer_size, num_byte_read)); REQUIRE(default_buffer_size == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, default_buffer_size)); } @@ -168,7 +170,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { file_reader.open(test_file_path); // first, advance to some random file_pos - size_t begin_read_pos = 45313; + size_t begin_read_pos = 45'313; REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(begin_read_pos)); // set a checkpoint @@ -178,17 +180,17 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { size_t num_bytes_to_read; size_t num_byte_read; - num_bytes_to_read = 345212; - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, - num_byte_read)); + num_bytes_to_read = 345'212; + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); REQUIRE(file_reader.get_pos() == checkpoint_pos + num_bytes_to_read); REQUIRE(0 == memcmp(read_buffer, test_data + begin_read_pos, num_bytes_to_read)); // now seek back to some where between size_t seek_pos = file_reader.get_pos() / 2; REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); - REQUIRE(ErrorCode_Success == 
file_reader.try_read(read_buffer, num_bytes_to_read, - num_byte_read)); + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); REQUIRE(num_bytes_to_read == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); @@ -196,8 +198,8 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { num_bytes_to_read = 500; seek_pos = test_data_size - num_bytes_to_read; REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); - REQUIRE(ErrorCode_Success == file_reader.try_read(read_buffer, num_bytes_to_read, - num_byte_read)); + REQUIRE(ErrorCode_Success + == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); REQUIRE(num_bytes_to_read == num_byte_read); REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); } @@ -207,17 +209,17 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { } #include "../src/FileReader.hpp" -TEST_CASE("Test delimiter", "[BufferedFileReader]") { +TEST_CASE("Test delimiter", "[BufferedFileReader]") { // Initialize data for testing - size_t test_data_size = 1L * 1024 * 1024; // 1MB + size_t test_data_size = 1L * 1024 * 1024; // 1MB char* test_data = new char[test_data_size]; std::srand(0); for (size_t i = 0; i < test_data_size; ++i) { test_data[i] = (char)('a' + (std::rand() % 26)); } - std::string test_file_path {"BufferedFileReader.delimiter.test"}; + std::string test_file_path{"BufferedFileReader.delimiter.test"}; // write to test file FileWriter file_writer; file_writer.open(test_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); @@ -234,10 +236,11 @@ TEST_CASE("Test delimiter", "[BufferedFileReader]") { ErrorCode error_code = ErrorCode_Success; char delimiter = (char)('a' + (std::rand() % 26)); - while(ErrorCode_EndOfFile != error_code) { + while (ErrorCode_EndOfFile != error_code) { error_code = file_reader.try_read_to_delimiter(delimiter, true, false, ref_string); - auto error_code2 = 
buffered_file_reader.try_read_to_delimiter(delimiter, true, false, test_string); + auto error_code2 + = buffered_file_reader.try_read_to_delimiter(delimiter, true, false, test_string); REQUIRE(error_code2 == error_code); REQUIRE(test_string == ref_string); } -} \ No newline at end of file +} From 426dea30b1a63dbf28ccac16696a1253e5a8c280 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 9 Aug 2023 16:30:07 -0400 Subject: [PATCH 071/121] Small cleanup --- components/core/tests/test-BufferedFileReader.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 287e26c9a..0234b05a7 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -1,6 +1,3 @@ -// C libraries -#include - // Boost libraries #include @@ -114,7 +111,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { REQUIRE(file_reader.get_pos() == num_bytes_to_read_1 + num_bytes_to_read_2); size_t latest_file_pos = file_reader.get_pos(); - // now seek back to some where between + // now seek back to somewhere between size_t seek_pos_1 = checkpoint_pos + 500; REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos_1)); REQUIRE(ErrorCode_Success @@ -186,7 +183,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { REQUIRE(file_reader.get_pos() == checkpoint_pos + num_bytes_to_read); REQUIRE(0 == memcmp(read_buffer, test_data + begin_read_pos, num_bytes_to_read)); - // now seek back to some where between + // now seek back to somewhere between size_t seek_pos = file_reader.get_pos() / 2; REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); REQUIRE(ErrorCode_Success From afff981846b186f244ea2b5c302f6db056c4f7da Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 9 Aug 2023 21:23:43 -0400 Subject: 
[PATCH 072/121] Some refactor for clang-tidy --- components/core/src/BufferReader.cpp | 33 ++++++-------- components/core/src/BufferReader.hpp | 25 +++++----- components/core/src/BufferedFileReader.cpp | 50 ++++++++++---------- components/core/src/BufferedFileReader.hpp | 53 ++++++++++++---------- 4 files changed, 84 insertions(+), 77 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 5dd8cf3b7..76bb3b11a 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -4,10 +4,6 @@ #include #include -namespace { - -} - BufferReader::BufferReader(char const* data, size_t data_size, size_t pos) { if (nullptr == data) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); @@ -17,7 +13,8 @@ BufferReader::BufferReader(char const* data, size_t data_size, size_t pos) { m_internal_buf_pos = pos; } -ErrorCode BufferReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { +auto BufferReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) + -> ErrorCode { if (nullptr == buf && num_bytes_to_read > 0) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } @@ -28,14 +25,14 @@ ErrorCode BufferReader::try_read(char* buf, size_t num_bytes_to_read, size_t& nu } num_bytes_read = std::min(remaining_data_size, num_bytes_to_read); - auto copy_begin = m_internal_buf + m_internal_buf_pos; - auto copy_end = copy_begin + num_bytes_read; + const auto* copy_begin = m_internal_buf + m_internal_buf_pos; + const auto* copy_end = copy_begin + num_bytes_read; std::copy(copy_begin, copy_end, buf); m_internal_buf_pos += num_bytes_read; return ErrorCode_Success; } -ErrorCode BufferReader::try_seek_from_begin(size_t pos) { +auto BufferReader::try_seek_from_begin(size_t pos) -> ErrorCode { if (pos > m_internal_buf_size) { return ErrorCode_Truncated; } @@ -43,37 +40,37 @@ ErrorCode BufferReader::try_seek_from_begin(size_t pos) { return 
ErrorCode_Success; } -ErrorCode BufferReader::try_get_pos(size_t& pos) { +auto BufferReader::try_get_pos(size_t& pos) -> ErrorCode { pos = m_internal_buf_pos; return ErrorCode_Success; } -ErrorCode BufferReader::try_read_to_delimiter( +auto BufferReader::try_read_to_delimiter( char delim, bool keep_delimiter, bool append, std::string& str -) { - bool found_delim; - size_t num_bytes_read; +) -> ErrorCode { + bool found_delim{false}; + size_t num_bytes_read{0}; if (false == append) { str.clear(); } return try_read_to_delimiter(delim, keep_delimiter, str, found_delim, num_bytes_read); } -void BufferReader::peek_buffer(char const*& buf, size_t& peek_size) { +auto BufferReader::peek_buffer(char const*& buf, size_t& peek_size) -> void { peek_size = get_remaining_data_size(); buf = m_internal_buf + m_internal_buf_pos; } -ErrorCode BufferReader::try_read_to_delimiter( +auto BufferReader::try_read_to_delimiter( char delim, bool keep_delimiter, std::string& str, bool& found_delim, size_t& num_bytes_read -) { +) -> ErrorCode { found_delim = false; auto const remaining_data_size = get_remaining_data_size(); if (0 == remaining_data_size) { @@ -82,9 +79,9 @@ ErrorCode BufferReader::try_read_to_delimiter( // Find the delimiter char const* buffer_head = m_internal_buf + m_internal_buf_pos; char const* delim_ptr - = reinterpret_cast(memchr(buffer_head, delim, remaining_data_size)); + = static_cast(memchr(buffer_head, delim, remaining_data_size)); - size_t delim_pos; + size_t delim_pos{0}; if (delim_ptr != nullptr) { delim_pos = (delim_ptr - m_internal_buf) + 1; num_bytes_read = delim_pos - m_internal_buf_pos; diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 4e583b3b5..5b7cacea4 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -17,7 +17,7 @@ class BufferReader : public ReaderInterface { : TraceableException(error_code, filename, line_number) {} // Methods - [[nodiscard]] char const* what() 
const noexcept override { + [[nodiscard]] auto what() const noexcept -> char const* override { return "BufferReader operation failed"; } }; @@ -37,40 +37,41 @@ class BufferReader : public ReaderInterface { * @return ErrorCode_EndOfFile if buffer doesn't contain more data * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode - try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; + [[nodiscard]] auto try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) + -> ErrorCode override; /** * Tries to seek from the beginning of the buffer to the given position * @param pos * @return ErrorCode_OutOfBounds if the given position > the buffer's size * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_seek_from_begin(size_t pos) override; + [[nodiscard]] auto try_seek_from_begin(size_t pos) -> ErrorCode override; /** * @param pos Returns the position of the read head in the buffer * @return ErrorCode_Success */ - [[nodiscard]] ErrorCode try_get_pos(size_t& pos) override; + [[nodiscard]] auto try_get_pos(size_t& pos) -> ErrorCode override; - [[nodiscard]] ErrorCode - try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str) override; + [[nodiscard]] auto + try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str) + -> ErrorCode override; // Helper functions - [[nodiscard]] size_t get_buffer_size() const { return m_internal_buf_size; } + [[nodiscard]] auto get_buffer_size() const -> size_t { return m_internal_buf_size; } - void peek_buffer(char const*& buf, size_t& peek_size); + auto peek_buffer(char const*& buf, size_t& peek_size) -> void; - ErrorCode try_read_to_delimiter( + auto try_read_to_delimiter( char delim, bool keep_delimiter, std::string& str, bool& found_delim, size_t& num_bytes_read - ); + ) -> ErrorCode; private: // Method - [[nodiscard]] size_t get_remaining_data_size() const { + [[nodiscard]] auto get_remaining_data_size() const -> 
size_t { return m_internal_buf_size - m_internal_buf_pos; } diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index c9a8d21e4..509398986 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -13,8 +13,8 @@ using std::make_unique; using std::string; namespace { -ErrorCode -try_read_into_buffer(int fd, char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { +auto try_read_into_buffer(int fd, char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) + -> ErrorCode { num_bytes_read = 0; while (true) { auto const bytes_read = ::read(fd, buf, num_bytes_to_read); @@ -51,7 +51,7 @@ BufferedFileReader::~BufferedFileReader() { close(); } -ErrorCode BufferedFileReader::try_get_pos(size_t& pos) { +auto BufferedFileReader::try_get_pos(size_t& pos) -> ErrorCode { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -59,7 +59,7 @@ ErrorCode BufferedFileReader::try_get_pos(size_t& pos) { return ErrorCode_Success; } -ErrorCode BufferedFileReader::try_seek_from_begin(size_t pos) { +auto BufferedFileReader::try_seek_from_begin(size_t pos) -> ErrorCode { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -79,7 +79,8 @@ ErrorCode BufferedFileReader::try_seek_from_begin(size_t pos) { auto error_code = m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos)); if (ErrorCode_Truncated == error_code) { if (false == m_checkpoint_pos.has_value()) { - // if checkpoint is not set, simply move the file_pos and invalidate the buffer reader + // if checkpoint is not set, simply move the file_pos and invalidate + // the buffer reader auto offset = lseek(m_fd, static_cast<__off64_t>(pos), SEEK_SET); if (-1 == offset) { return ErrorCode_errno; @@ -87,7 +88,7 @@ ErrorCode BufferedFileReader::try_seek_from_begin(size_t pos) { m_buffer_reader.emplace(m_buffer.get(), 0); m_buffer_begin_pos = pos; } else { - size_t num_bytes_to_refill = pos - get_buffer_end_pos(); + auto const 
num_bytes_to_refill = pos - get_buffer_end_pos(); error_code = refill_reader_buffer(num_bytes_to_refill); if (ErrorCode_EndOfFile == error_code) { return ErrorCode_Truncated; @@ -107,8 +108,8 @@ ErrorCode BufferedFileReader::try_seek_from_begin(size_t pos) { return ErrorCode_Success; } -ErrorCode -BufferedFileReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { +auto BufferedFileReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) + -> ErrorCode { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -138,7 +139,8 @@ BufferedFileReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_by error_code = refill_reader_buffer(m_base_buffer_size); if (ErrorCode_EndOfFile == error_code) { break; - } else if (ErrorCode_Success != error_code) { + } + if (ErrorCode_Success != error_code) { return error_code; } } @@ -148,12 +150,12 @@ BufferedFileReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_by return ErrorCode_Success; } -ErrorCode BufferedFileReader::try_read_to_delimiter( +auto BufferedFileReader::try_read_to_delimiter( char delim, bool keep_delimiter, bool append, string& str -) { +) -> ErrorCode { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -181,7 +183,8 @@ ErrorCode BufferedFileReader::try_read_to_delimiter( return ErrorCode_EndOfFile; } return ErrorCode_Success; - } else if (ErrorCode_Success != error_code) { + } + if (ErrorCode_Success != error_code) { return error_code; } } @@ -189,7 +192,7 @@ ErrorCode BufferedFileReader::try_read_to_delimiter( return ErrorCode_Success; } -ErrorCode BufferedFileReader::try_open(string const& path) { +auto BufferedFileReader::try_open(string const& path) -> ErrorCode { // Cleanup in case caller forgot to call close before calling this function close(); @@ -209,7 +212,7 @@ ErrorCode BufferedFileReader::try_open(string const& path) { } void BufferedFileReader::open(string const& path) { - ErrorCode error_code = try_open(path); + auto const error_code = 
try_open(path); if (ErrorCode_Success != error_code) { if (ErrorCode_FileNotFound == error_code) { throw OperationFailedWithMsg( @@ -218,9 +221,8 @@ void BufferedFileReader::open(string const& path) { __LINE__, "File not found: " + boost::filesystem::weakly_canonical(path).string() ); - } else { - throw OperationFailed(error_code, __FILENAME__, __LINE__); } + throw OperationFailed(error_code, __FILENAME__, __LINE__); } } @@ -239,7 +241,7 @@ void BufferedFileReader::close() { } } -size_t BufferedFileReader::set_checkpoint() { +auto BufferedFileReader::set_checkpoint() -> size_t { if (m_checkpoint_pos.has_value() && m_checkpoint_pos < m_file_pos) { if (m_buffer_reader->get_buffer_size() != m_base_buffer_size) { // allocate new buffer for buffered data starting from pos @@ -251,7 +253,7 @@ size_t BufferedFileReader::set_checkpoint() { return m_file_pos; } -void BufferedFileReader::clear_checkpoint() { +auto BufferedFileReader::clear_checkpoint() -> void { if (false == m_checkpoint_pos.has_value()) { return; } @@ -261,7 +263,7 @@ void BufferedFileReader::clear_checkpoint() { m_checkpoint_pos.reset(); } -ErrorCode BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_size) { +auto BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_size) -> ErrorCode { if (-1 == m_fd) { return ErrorCode_NotInit; } @@ -278,20 +280,20 @@ ErrorCode BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_ return ErrorCode_Success; } -size_t BufferedFileReader::quantize_to_buffer_size(size_t size) const { +auto BufferedFileReader::quantize_to_buffer_size(size_t size) const -> size_t { if (size == 0) { return 0; } return (1 + ((size - 1) / m_base_buffer_size)) * m_base_buffer_size; } -ErrorCode BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) { +auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> ErrorCode { size_t num_bytes_refilled = 0; auto const buffer_end_pos = get_buffer_end_pos(); auto const 
data_size = m_buffer_reader->get_buffer_size(); auto const available_buffer_space = m_buffer_size - data_size; - size_t buf_internal_pos; + size_t buf_internal_pos{0}; size_t bytes_to_read = m_base_buffer_size - (buffer_end_pos % m_base_buffer_size); if (m_checkpoint_pos.has_value()) { @@ -331,7 +333,7 @@ ErrorCode BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) { return ErrorCode_Success; } -void BufferedFileReader::resize_buffer_from_pos(size_t pos) { +auto BufferedFileReader::resize_buffer_from_pos(size_t pos) -> void { if (pos > m_buffer_reader->get_buffer_size()) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } @@ -347,7 +349,7 @@ void BufferedFileReader::resize_buffer_from_pos(size_t pos) { m_buffer_reader.emplace(m_buffer.get(), copy_size); } -void BufferedFileReader::update_file_pos(size_t pos) { +auto BufferedFileReader::update_file_pos(size_t pos) -> void { m_file_pos = pos; m_highest_read_pos = std::max(m_file_pos, m_highest_read_pos); } diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 2e22815fd..9aa7bd395 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -33,7 +33,7 @@ class BufferedFileReader : public ReaderInterface { : TraceableException(error_code, filename, line_number) {} // Methods - [[nodiscard]] char const* what() const noexcept override { + [[nodiscard]] auto what() const noexcept -> char const* override { return "BufferedFileReader operation failed"; } }; @@ -48,10 +48,10 @@ class BufferedFileReader : public ReaderInterface { std::string message ) : TraceableException(error_code, filename, line_number), - m_message(message) {} + m_message(std::move(message)) {} // Methods - [[nodiscard]] char const* what() const noexcept override { + [[nodiscard]] auto what() const noexcept -> char const* override { return "BufferedFileReader operation failed"; } @@ -66,6 +66,12 @@ class 
BufferedFileReader : public ReaderInterface { ~BufferedFileReader(); + // explicitly disable copy or move + BufferedFileReader(BufferedFileReader const&) = delete; + BufferedFileReader(BufferedFileReader&&) = delete; + auto operator=(BufferedFileReader) -> BufferedFileReader& = delete; + auto operator=(BufferedFileReader&&) -> BufferedFileReader& = delete; + // Methods implementing the ReaderInterface /** * Tries to get the current position of the read head in the file @@ -74,7 +80,7 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_get_pos(size_t& pos) override; + [[nodiscard]] auto try_get_pos(size_t& pos) -> ErrorCode override; /** * Tries to seek from the beginning of the file to the given position * @param pos @@ -82,7 +88,7 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_errno on error * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_seek_from_begin(size_t pos) override; + [[nodiscard]] auto try_seek_from_begin(size_t pos) -> ErrorCode override; /** * Tries to read up to a given number of bytes from the file @@ -95,8 +101,8 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode - try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; + [[nodiscard]] auto try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) + -> ErrorCode override; /** * Tries to read a string from the file until it reaches @@ -111,11 +117,12 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_errno otherwise */ - [[nodiscard]] ErrorCode - try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str) override; + [[nodiscard]] auto + try_read_to_delimiter(char delim, bool keep_delimiter, bool append, 
std::string& str) + -> ErrorCode override; // Methods - [[nodiscard]] bool is_open() const { return -1 != m_fd; } + [[nodiscard]] auto is_open() const -> bool { return -1 != m_fd; } /** * Tries to open a file @@ -124,19 +131,19 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_FileNotFound if the file was not found * @return ErrorCode_errno otherwise */ - [[nodiscard]] ErrorCode try_open(std::string const& path); + [[nodiscard]] auto try_open(std::string const& path) -> ErrorCode; /** * Opens a file * @param path * @throw BufferedFileReader::OperationFailed on failure */ - void open(std::string const& path); + auto open(std::string const& path) -> void; /** * Closes the file if it's open */ - void close(); + auto close() -> void; - [[nodiscard]] std::string const& get_path() const { return m_path; } + [[nodiscard]] auto get_path() const -> std::string const& { return m_path; } /** * Peeks the buffer without advancing the file @@ -151,7 +158,7 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_NotInit if the file is not opened * @return ErrorCode_EndOfFile if already reaching the eof */ - [[nodiscard]] ErrorCode peek_buffered_data(char const*& data_ptr, size_t& peek_size); + [[nodiscard]] auto peek_buffered_data(char const*& data_ptr, size_t& peek_size) -> ErrorCode; /** * Sets a checkpoint at the current file pos. @@ -171,7 +178,7 @@ class BufferedFileReader : public ReaderInterface { * in the buffer. * @return current file pos */ - size_t set_checkpoint(); + auto set_checkpoint() -> size_t; /** * Disable the checkpoint pos and release buffered data from memory @@ -185,7 +192,7 @@ class BufferedFileReader : public ReaderInterface { * 'm_buffer_size' using the rounding method. 
This ensures that the current * read pos still resides in the resized buffer */ - void clear_checkpoint(); + auto clear_checkpoint() -> void; private: // Methods @@ -194,7 +201,7 @@ class BufferedFileReader : public ReaderInterface { * @param size * @return quantized size */ - [[nodiscard]] size_t quantize_to_buffer_size(size_t size) const; + [[nodiscard]] auto quantize_to_buffer_size(size_t size) const -> size_t; /** * Reads next refill_size bytes from file descriptor to the internal buffer @@ -207,29 +214,29 @@ class BufferedFileReader : public ReaderInterface { * @return ErrorCode_NotInit if the file is not opened * @return ErrorCode_EndOfFile if already reaching the eof */ - [[nodiscard]] ErrorCode refill_reader_buffer(size_t refill_size); + [[nodiscard]] auto refill_reader_buffer(size_t refill_size) -> ErrorCode; /** * Resize the internal reader buffer and copy over data from the original * buffer staring from pos to the beginning of the resized the buffer * @param pos */ - void resize_buffer_from_pos(size_t pos); + auto resize_buffer_from_pos(size_t pos) -> void; /** * return the file_pos's corresponding pos in the internal buffer * @param file_pos * @return */ - [[nodiscard]] size_t get_buffer_relative_pos(size_t file_pos) const { + [[nodiscard]] auto get_buffer_relative_pos(size_t file_pos) const -> size_t { return file_pos - m_buffer_begin_pos; } - [[nodiscard]] size_t get_buffer_end_pos() const { + [[nodiscard]] auto get_buffer_end_pos() const -> size_t { return m_buffer_begin_pos + m_buffer_reader->get_buffer_size(); } - void update_file_pos(size_t pos); + auto update_file_pos(size_t pos) -> void; // Constants static constexpr size_t cMinBufferSize = (1ULL << 12); From 4ca5bd89e440035d9e2076fa9b3c53f4c5a20c01 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 9 Aug 2023 21:55:21 -0400 Subject: [PATCH 073/121] Run clang-format on unit-test --- .../core/tests/test-ir_encoding_methods.cpp | 404 ++++++++++-------- 1 
file changed, 235 insertions(+), 169 deletions(-) diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 2e0437b18..3c896e908 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -5,11 +5,11 @@ #include // Project headers +#include "../src/BufferReader.hpp" #include "../src/ffi/encoding_methods.hpp" -#include "../src/ffi/ir_stream/encoding_methods.hpp" #include "../src/ffi/ir_stream/decoding_methods.hpp" +#include "../src/ffi/ir_stream/encoding_methods.hpp" #include "../src/ffi/ir_stream/protocol_constants.hpp" -#include "../src/BufferReader.hpp" using ffi::decode_float_var; using ffi::decode_integer_var; @@ -24,21 +24,21 @@ using ffi::get_bounds_of_next_var; using ffi::ir_stream::cProtocol::EightByteEncodingMagicNumber; using ffi::ir_stream::cProtocol::FourByteEncodingMagicNumber; using ffi::ir_stream::cProtocol::MagicNumberLength; -using ffi::ir_stream::get_encoding_type; using ffi::ir_stream::decode_preamble; -using ffi::ir_stream::IRErrorCode; using ffi::ir_stream::encoded_tag_t; +using ffi::ir_stream::get_encoding_type; +using ffi::ir_stream::IRErrorCode; using ffi::VariablePlaceholder; using ffi::wildcard_query_matches_any_encoded_var; using std::chrono::duration_cast; using std::chrono::milliseconds; using std::chrono::system_clock; using std::is_same_v; -using std::string_view; using std::string; +using std::string_view; using std::vector; -static epoch_time_ms_t get_current_ts (); +static epoch_time_ms_t get_current_ts(); /** * @tparam encoded_variable_t Type of the encoded variable @@ -47,10 +47,10 @@ static epoch_time_ms_t get_current_ts (); * false otherwise */ template -bool match_encoding_type (bool is_four_bytes_encoding); +bool match_encoding_type(bool is_four_bytes_encoding); template -epoch_time_ms_t get_next_timestamp_for_test (); +epoch_time_ms_t get_next_timestamp_for_test(); /** * Helper function that 
encodes a preamble of encoding type = encoded_variable_t @@ -65,9 +65,13 @@ epoch_time_ms_t get_next_timestamp_for_test (); * @return True if preamble is encoded without error, otherwise false */ template -bool encode_preamble (string_view timestamp_pattern, - string_view timestamp_pattern_syntax, string_view time_zone_id, - epoch_time_ms_t reference_timestamp, vector& ir_buf); +bool encode_preamble( + string_view timestamp_pattern, + string_view timestamp_pattern_syntax, + string_view time_zone_id, + epoch_time_ms_t reference_timestamp, + vector& ir_buf +); /** * Helper function that encodes a message of encoding type = encoded_variable_t @@ -80,8 +84,12 @@ bool encode_preamble (string_view timestamp_pattern, * @return True if message is encoded without error, otherwise false */ template -bool encode_message (epoch_time_ms_t timestamp, string_view message, string& logtype, - vector& ir_buf); +bool encode_message( + epoch_time_ms_t timestamp, + string_view message, + string& logtype, + vector& ir_buf +); /** * Helper function that decodes a message of encoding type = encoded_variable_t @@ -97,7 +105,7 @@ bool encode_message (epoch_time_ms_t timestamp, string_view message, string& log * encoded_variable_t == four_byte_encoded_variable_t */ template -IRErrorCode decode_next_message (BufferReader& reader, string& message, epoch_time_ms_t& decoded_ts); +IRErrorCode decode_next_message(BufferReader& reader, string& message, epoch_time_ms_t& decoded_ts); /** * Struct to hold the timestamp info from the IR stream's metadata @@ -113,14 +121,14 @@ struct TimestampInfo { * @param metadata_json The JSON metadata * @param ts_info Returns the timestamp info */ -static void set_timestamp_info (const nlohmann::json& metadata_json, TimestampInfo& ts_info); +static void set_timestamp_info(nlohmann::json const& metadata_json, TimestampInfo& ts_info); -static epoch_time_ms_t get_current_ts () { +static epoch_time_ms_t get_current_ts() { return 
duration_cast(system_clock::now().time_since_epoch()).count(); } template -bool match_encoding_type (bool is_four_bytes_encoding) { +bool match_encoding_type(bool is_four_bytes_encoding) { static_assert(is_same_v || is_same_v); @@ -132,7 +140,7 @@ bool match_encoding_type (bool is_four_bytes_encoding) { } template -epoch_time_ms_t get_next_timestamp_for_test () { +epoch_time_ms_t get_next_timestamp_for_test() { static_assert(is_same_v || is_same_v); @@ -150,129 +158,161 @@ epoch_time_ms_t get_next_timestamp_for_test () { // A helper function to generalize the testing caller interface. // The reference_timestamp is only used by four bytes encoding template -bool encode_preamble (string_view timestamp_pattern, - string_view timestamp_pattern_syntax, string_view time_zone_id, - epoch_time_ms_t reference_timestamp, vector& ir_buf) { +bool encode_preamble( + string_view timestamp_pattern, + string_view timestamp_pattern_syntax, + string_view time_zone_id, + epoch_time_ms_t reference_timestamp, + vector& ir_buf +) { static_assert(is_same_v || is_same_v); if constexpr (is_same_v) { - return ffi::ir_stream::eight_byte_encoding::encode_preamble(timestamp_pattern, - timestamp_pattern_syntax, - time_zone_id, ir_buf); + return ffi::ir_stream::eight_byte_encoding::encode_preamble( + timestamp_pattern, + timestamp_pattern_syntax, + time_zone_id, + ir_buf + ); } else { - return ffi::ir_stream::four_byte_encoding::encode_preamble(timestamp_pattern, - timestamp_pattern_syntax, - time_zone_id, - reference_timestamp, ir_buf); + return ffi::ir_stream::four_byte_encoding::encode_preamble( + timestamp_pattern, + timestamp_pattern_syntax, + time_zone_id, + reference_timestamp, + ir_buf + ); } } template -bool encode_message (epoch_time_ms_t timestamp, string_view message, string& logtype, - vector& ir_buf) { +bool encode_message( + epoch_time_ms_t timestamp, + string_view message, + string& logtype, + vector& ir_buf +) { static_assert(is_same_v || is_same_v); if constexpr (is_same_v) { - 
return ffi::ir_stream::eight_byte_encoding::encode_message(timestamp, message, logtype, - ir_buf); + return ffi::ir_stream::eight_byte_encoding::encode_message( + timestamp, + message, + logtype, + ir_buf + ); } else { - return ffi::ir_stream::four_byte_encoding::encode_message(timestamp, message, logtype, - ir_buf); + return ffi::ir_stream::four_byte_encoding::encode_message( + timestamp, + message, + logtype, + ir_buf + ); } } template -IRErrorCode decode_next_message (BufferReader& reader, string& message, epoch_time_ms_t& decoded_ts) { +IRErrorCode +decode_next_message(BufferReader& reader, string& message, epoch_time_ms_t& decoded_ts) { static_assert(is_same_v || is_same_v); if constexpr (is_same_v) { - return ffi::ir_stream::eight_byte_encoding::decode_next_message(reader, message, - decoded_ts); + return ffi::ir_stream::eight_byte_encoding::decode_next_message( + reader, + message, + decoded_ts + ); } else { return ffi::ir_stream::four_byte_encoding::decode_next_message(reader, message, decoded_ts); } } -static void set_timestamp_info (const nlohmann::json& metadata_json, TimestampInfo& ts_info) { +static void set_timestamp_info(nlohmann::json const& metadata_json, TimestampInfo& ts_info) { ts_info.time_zone_id = metadata_json.at(ffi::ir_stream::cProtocol::Metadata::TimeZoneIdKey); - ts_info.timestamp_pattern = - metadata_json.at(ffi::ir_stream::cProtocol::Metadata::TimestampPatternKey); - ts_info.timestamp_pattern_syntax = - metadata_json.at(ffi::ir_stream::cProtocol::Metadata::TimestampPatternSyntaxKey); + ts_info.timestamp_pattern + = metadata_json.at(ffi::ir_stream::cProtocol::Metadata::TimestampPatternKey); + ts_info.timestamp_pattern_syntax + = metadata_json.at(ffi::ir_stream::cProtocol::Metadata::TimestampPatternSyntaxKey); } TEST_CASE("get_encoding_type", "[ffi][get_encoding_type]") { bool is_four_bytes_encoding; // Test eight-byte encoding - vector eight_byte_encoding_vec{EightByteEncodingMagicNumber, - EightByteEncodingMagicNumber + 
MagicNumberLength}; + vector eight_byte_encoding_vec{ + EightByteEncodingMagicNumber, + EightByteEncodingMagicNumber + MagicNumberLength}; BufferReader eight_byte_ir_buffer{ - size_checked_pointer_cast(eight_byte_encoding_vec.data()), - eight_byte_encoding_vec.size() - }; - REQUIRE(get_encoding_type(eight_byte_ir_buffer, is_four_bytes_encoding) == - IRErrorCode::IRErrorCode_Success); + size_checked_pointer_cast(eight_byte_encoding_vec.data()), + eight_byte_encoding_vec.size()}; + REQUIRE(get_encoding_type(eight_byte_ir_buffer, is_four_bytes_encoding) + == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); // Test four-byte encoding - vector four_byte_encoding_vec{FourByteEncodingMagicNumber, - FourByteEncodingMagicNumber + MagicNumberLength}; + vector four_byte_encoding_vec{ + FourByteEncodingMagicNumber, + FourByteEncodingMagicNumber + MagicNumberLength}; BufferReader four_byte_ir_buffer{ - size_checked_pointer_cast(four_byte_encoding_vec.data()), - four_byte_encoding_vec.size() - }; - REQUIRE(get_encoding_type(four_byte_ir_buffer, is_four_bytes_encoding) == - IRErrorCode::IRErrorCode_Success); + size_checked_pointer_cast(four_byte_encoding_vec.data()), + four_byte_encoding_vec.size()}; + REQUIRE(get_encoding_type(four_byte_ir_buffer, is_four_bytes_encoding) + == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); // Test error on empty and incomplete ir_buffer - BufferReader empty_ir_buffer(size_checked_pointer_cast(four_byte_encoding_vec.data()), 0); - REQUIRE(get_encoding_type(empty_ir_buffer, is_four_bytes_encoding) == - IRErrorCode::IRErrorCode_Incomplete_IR); + BufferReader empty_ir_buffer( + size_checked_pointer_cast(four_byte_encoding_vec.data()), + 0 + ); + REQUIRE(get_encoding_type(empty_ir_buffer, is_four_bytes_encoding) + == IRErrorCode::IRErrorCode_Incomplete_IR); BufferReader incomplete_buffer{ - size_checked_pointer_cast(four_byte_encoding_vec.data()), - 
four_byte_encoding_vec.size() - 1 - }; - REQUIRE(get_encoding_type(incomplete_buffer, is_four_bytes_encoding) == - IRErrorCode::IRErrorCode_Incomplete_IR); + size_checked_pointer_cast(four_byte_encoding_vec.data()), + four_byte_encoding_vec.size() - 1}; + REQUIRE(get_encoding_type(incomplete_buffer, is_four_bytes_encoding) + == IRErrorCode::IRErrorCode_Incomplete_IR); // Test error on invalid encoding - const vector invalid_ir_vec{0x02, 0x43, 0x24, 0x34}; + vector const invalid_ir_vec{0x02, 0x43, 0x24, 0x34}; BufferReader invalid_ir_buffer{ - size_checked_pointer_cast(invalid_ir_vec.data()), - invalid_ir_vec.size() - }; - REQUIRE(get_encoding_type(invalid_ir_buffer, is_four_bytes_encoding) == - IRErrorCode::IRErrorCode_Corrupted_IR); - + size_checked_pointer_cast(invalid_ir_vec.data()), + invalid_ir_vec.size()}; + REQUIRE(get_encoding_type(invalid_ir_buffer, is_four_bytes_encoding) + == IRErrorCode::IRErrorCode_Corrupted_IR); } -TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encoded_variable_t, - eight_byte_encoded_variable_t) -{ +TEMPLATE_TEST_CASE( + "decode_preamble", + "[ffi][decode_preamble]", + four_byte_encoded_variable_t, + eight_byte_encoded_variable_t +) { vector ir_buf; constexpr char timestamp_pattern[] = "%Y-%m-%d %H:%M:%S,%3"; constexpr char timestamp_pattern_syntax[] = "yyyy-MM-dd HH:mm:ss"; constexpr char time_zone_id[] = "Asia/Tokyo"; - const epoch_time_ms_t reference_ts = get_current_ts(); - REQUIRE(encode_preamble(timestamp_pattern, timestamp_pattern_syntax, time_zone_id, - reference_ts, ir_buf)); - const size_t encoded_preamble_end_pos = ir_buf.size(); + epoch_time_ms_t const reference_ts = get_current_ts(); + REQUIRE(encode_preamble( + timestamp_pattern, + timestamp_pattern_syntax, + time_zone_id, + reference_ts, + ir_buf + )); + size_t const encoded_preamble_end_pos = ir_buf.size(); // Check if encoding type is properly read - BufferReader ir_buffer{ - size_checked_pointer_cast(ir_buf.data()), ir_buf.size() - }; + 
BufferReader ir_buffer{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; bool is_four_bytes_encoding; - REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == - IRErrorCode::IRErrorCode_Success); + REQUIRE(get_encoding_type(ir_buffer, is_four_bytes_encoding) == IRErrorCode::IRErrorCode_Success + ); REQUIRE(match_encoding_type(is_four_bytes_encoding)); REQUIRE(MagicNumberLength == ir_buffer.get_pos()); @@ -281,16 +321,16 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode encoded_tag_t metadata_type{0}; size_t metadata_pos{0}; uint16_t metadata_size{0}; - REQUIRE(decode_preamble(ir_buffer, metadata_type, metadata_pos, metadata_size) == - IRErrorCode::IRErrorCode_Success); + REQUIRE(decode_preamble(ir_buffer, metadata_type, metadata_pos, metadata_size) + == IRErrorCode::IRErrorCode_Success); REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); char* metadata_ptr{size_checked_pointer_cast(ir_buf.data()) + metadata_pos}; string_view json_metadata{metadata_ptr, metadata_size}; auto metadata_json = nlohmann::json::parse(json_metadata); - REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == - metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey)); + REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue + == metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey)); REQUIRE(ffi::ir_stream::cProtocol::Metadata::EncodingJson == metadata_type); set_timestamp_info(metadata_json, ts_info); REQUIRE(timestamp_pattern_syntax == ts_info.timestamp_pattern_syntax); @@ -299,97 +339,104 @@ TEMPLATE_TEST_CASE("decode_preamble", "[ffi][decode_preamble]", four_byte_encode REQUIRE(encoded_preamble_end_pos == ir_buffer.get_pos()); if constexpr (is_same_v) { - REQUIRE(reference_ts == - std::stoll( + REQUIRE(reference_ts + == std::stoll( metadata_json.at(ffi::ir_stream::cProtocol::Metadata::ReferenceTimestampKey) - .get())); + .get() + )); } // Test if preamble can be decoded by the string copy method 
std::vector json_metadata_vec; ir_buffer.seek_from_begin(MagicNumberLength); - REQUIRE(decode_preamble(ir_buffer, metadata_type, json_metadata_vec) == - IRErrorCode::IRErrorCode_Success); - string_view json_metadata_copied {size_checked_pointer_cast(json_metadata_vec.data()), json_metadata_vec.size()}; + REQUIRE(decode_preamble(ir_buffer, metadata_type, json_metadata_vec) + == IRErrorCode::IRErrorCode_Success); + string_view json_metadata_copied{ + size_checked_pointer_cast(json_metadata_vec.data()), + json_metadata_vec.size()}; // Crosscheck with the json_metadata decoded previously - REQUIRE (json_metadata_copied == json_metadata); + REQUIRE(json_metadata_copied == json_metadata); // Test if incomplete IR can be detected ir_buf.resize(encoded_preamble_end_pos - 1); - BufferReader incomplete_preamble_buffer{size_checked_pointer_cast(ir_buf.data()), - ir_buf.size()}; + BufferReader incomplete_preamble_buffer{ + size_checked_pointer_cast(ir_buf.data()), + ir_buf.size()}; incomplete_preamble_buffer.seek_from_begin(MagicNumberLength); - REQUIRE(decode_preamble( - incomplete_preamble_buffer, metadata_type, metadata_pos, metadata_size) == - IRErrorCode::IRErrorCode_Incomplete_IR); + REQUIRE(decode_preamble(incomplete_preamble_buffer, metadata_type, metadata_pos, metadata_size) + == IRErrorCode::IRErrorCode_Incomplete_IR); // Test if corrupted IR can be detected ir_buf[MagicNumberLength] = 0x23; - BufferReader corrupted_preamble_buffer{size_checked_pointer_cast(ir_buf.data()), - ir_buf.size()}; - REQUIRE(decode_preamble( - corrupted_preamble_buffer, metadata_type, metadata_pos, metadata_size) == - IRErrorCode::IRErrorCode_Corrupted_IR); + BufferReader corrupted_preamble_buffer{ + size_checked_pointer_cast(ir_buf.data()), + ir_buf.size()}; + REQUIRE(decode_preamble(corrupted_preamble_buffer, metadata_type, metadata_pos, metadata_size) + == IRErrorCode::IRErrorCode_Corrupted_IR); } -TEMPLATE_TEST_CASE("decode_next_message_general", "[ffi][decode_next_message]", - 
four_byte_encoded_variable_t, eight_byte_encoded_variable_t) -{ +TEMPLATE_TEST_CASE( + "decode_next_message_general", + "[ffi][decode_next_message]", + four_byte_encoded_variable_t, + eight_byte_encoded_variable_t +) { vector ir_buf; string logtype; string placeholder_as_string{enum_to_underlying_type(VariablePlaceholder::Dictionary)}; - string message = "Static <\text>, dictVar1, 123, 456.7 dictVar2, 987, 654.3," + - placeholder_as_string + " end of static text"; + string message = "Static <\text>, dictVar1, 123, 456.7 dictVar2, 987, 654.3," + + placeholder_as_string + " end of static text"; epoch_time_ms_t reference_timestamp = get_next_timestamp_for_test(); REQUIRE(true == encode_message(reference_timestamp, message, logtype, ir_buf)); - const size_t encoded_message_end_pos = ir_buf.size(); - const size_t encoded_message_start_pos = 0; + size_t const encoded_message_end_pos = ir_buf.size(); + size_t const encoded_message_start_pos = 0; - BufferReader ir_buffer{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; + BufferReader ir_buffer{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; string decoded_message; epoch_time_ms_t timestamp; // Test if message can be decoded properly - REQUIRE(IRErrorCode::IRErrorCode_Success == - decode_next_message(ir_buffer, decoded_message, timestamp)); + REQUIRE(IRErrorCode::IRErrorCode_Success + == decode_next_message(ir_buffer, decoded_message, timestamp)); REQUIRE(message == decoded_message); REQUIRE(timestamp == reference_timestamp); REQUIRE(ir_buffer.get_pos() == encoded_message_end_pos); // Test corrupted IR ir_buffer.seek_from_begin(encoded_message_start_pos + 1); - REQUIRE(IRErrorCode::IRErrorCode_Corrupted_IR == - decode_next_message(ir_buffer, message, timestamp)); + REQUIRE(IRErrorCode::IRErrorCode_Corrupted_IR + == decode_next_message(ir_buffer, message, timestamp)); // Test incomplete IR ir_buf.resize(encoded_message_end_pos - 4); - BufferReader 
incomplete_preamble_buffer{size_checked_pointer_cast(ir_buf.data()), - ir_buf.size()}; - REQUIRE(IRErrorCode::IRErrorCode_Incomplete_IR == - decode_next_message(incomplete_preamble_buffer, message, timestamp)); + BufferReader incomplete_preamble_buffer{ + size_checked_pointer_cast(ir_buf.data()), + ir_buf.size()}; + REQUIRE(IRErrorCode::IRErrorCode_Incomplete_IR + == decode_next_message(incomplete_preamble_buffer, message, timestamp)); } // NOTE: This test only tests eight_byte_encoded_variable_t because we trigger // IRErrorCode_Decode_Error by manually modifying the logtype within the IR, and // this is easier for the eight_byte_encoded_variable_t case. -TEST_CASE("message_decode_error", "[ffi][decode_next_message]") -{ +TEST_CASE("message_decode_error", "[ffi][decode_next_message]") { vector ir_buf; string logtype; string placeholder_as_string{enum_to_underlying_type(VariablePlaceholder::Dictionary)}; - string message = "Static <\text>, dictVar1, 123, 456.7 dictVar2, 987, 654.3," + - placeholder_as_string + " end of static text"; + string message = "Static <\text>, dictVar1, 123, 456.7 dictVar2, 987, 654.3," + + placeholder_as_string + " end of static text"; epoch_time_ms_t reference_ts = get_next_timestamp_for_test(); - REQUIRE(true == encode_message(reference_ts, message, - logtype, ir_buf)); + REQUIRE(true + == encode_message(reference_ts, message, logtype, ir_buf) + ); // Find the end of the encoded logtype which is before the encoded timestamp // The timestamp is encoded as tagbyte + eight_byte_encoded_variable_t - size_t timestamp_encoding_size = sizeof(ffi::ir_stream::cProtocol::Payload::TimestampVal) + - sizeof(eight_byte_encoded_variable_t); - const size_t logtype_end_pos = ir_buf.size() - timestamp_encoding_size; + size_t timestamp_encoding_size = sizeof(ffi::ir_stream::cProtocol::Payload::TimestampVal) + + sizeof(eight_byte_encoded_variable_t); + size_t const logtype_end_pos = ir_buf.size() - timestamp_encoding_size; string decoded_message; 
epoch_time_ms_t timestamp; @@ -398,26 +445,28 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") auto ir_with_extra_escape{ir_buf}; ir_with_extra_escape.at(logtype_end_pos - 1) = ffi::cVariablePlaceholderEscapeCharacter; BufferReader ir_with_extra_escape_buffer{ - size_checked_pointer_cast(ir_with_extra_escape.data()), - ir_with_extra_escape.size() - }; - REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == - decode_next_message(ir_with_extra_escape_buffer, - decoded_message, - timestamp)); + size_checked_pointer_cast(ir_with_extra_escape.data()), + ir_with_extra_escape.size()}; + REQUIRE(IRErrorCode::IRErrorCode_Decode_Error + == decode_next_message( + ir_with_extra_escape_buffer, + decoded_message, + timestamp + )); // Test if an extra placeholder triggers a decoder error auto ir_with_extra_placeholder{ir_buf}; - ir_with_extra_placeholder.at(logtype_end_pos - 1) = - enum_to_underlying_type(VariablePlaceholder::Dictionary); + ir_with_extra_placeholder.at(logtype_end_pos - 1) + = enum_to_underlying_type(VariablePlaceholder::Dictionary); BufferReader ir_with_extra_placeholder_buffer{ - size_checked_pointer_cast(ir_with_extra_placeholder.data()), - ir_with_extra_placeholder.size() - }; - REQUIRE(IRErrorCode::IRErrorCode_Decode_Error == - decode_next_message(ir_with_extra_placeholder_buffer, - decoded_message, - timestamp)); + size_checked_pointer_cast(ir_with_extra_placeholder.data()), + ir_with_extra_placeholder.size()}; + REQUIRE(IRErrorCode::IRErrorCode_Decode_Error + == decode_next_message( + ir_with_extra_placeholder_buffer, + decoded_message, + timestamp + )); } TEST_CASE("decode_next_message_four_byte_negative_delta", "[ffi][decode_next_message]") { @@ -427,21 +476,33 @@ TEST_CASE("decode_next_message_four_byte_negative_delta", "[ffi][decode_next_mes string logtype; epoch_time_ms_t reference_delta_ts_negative = -5; - REQUIRE(true == encode_message(reference_delta_ts_negative, - message, logtype, ir_buf)); - - BufferReader 
ir_buffer{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; + REQUIRE(true + == encode_message( + reference_delta_ts_negative, + message, + logtype, + ir_buf + )); + + BufferReader ir_buffer{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; string decoded_message; epoch_time_ms_t delta_ts; - REQUIRE(IRErrorCode::IRErrorCode_Success == - decode_next_message(ir_buffer, decoded_message, - delta_ts)); + REQUIRE(IRErrorCode::IRErrorCode_Success + == decode_next_message( + ir_buffer, + decoded_message, + delta_ts + )); REQUIRE(message == decoded_message); REQUIRE(delta_ts == reference_delta_ts_negative); } -TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", - four_byte_encoded_variable_t, eight_byte_encoded_variable_t) { +TEMPLATE_TEST_CASE( + "decode_ir_complete", + "[ffi][decode_next_message]", + four_byte_encoded_variable_t, + eight_byte_encoded_variable_t +) { vector ir_buf; string logtype; @@ -449,9 +510,14 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", constexpr char timestamp_pattern[] = "%Y-%m-%d %H:%M:%S,%3"; constexpr char timestamp_pattern_syntax[] = "yyyy-MM-dd HH:mm:ss"; constexpr char time_zone_id[] = "Asia/Tokyo"; - REQUIRE(encode_preamble(timestamp_pattern, timestamp_pattern_syntax, time_zone_id, - preamble_ts, ir_buf)); - const size_t encoded_preamble_end_pos = ir_buf.size(); + REQUIRE(encode_preamble( + timestamp_pattern, + timestamp_pattern_syntax, + time_zone_id, + preamble_ts, + ir_buf + )); + size_t const encoded_preamble_end_pos = ir_buf.size(); string message; epoch_time_ms_t ts; @@ -473,12 +539,13 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", reference_messages.push_back(message); reference_timestamps.push_back(ts); - BufferReader complete_ir_buffer{size_checked_pointer_cast(ir_buf.data()), - ir_buf.size()}; + BufferReader complete_ir_buffer{ + size_checked_pointer_cast(ir_buf.data()), + ir_buf.size()}; bool is_four_bytes_encoding; - 
REQUIRE(get_encoding_type(complete_ir_buffer, is_four_bytes_encoding) == - IRErrorCode::IRErrorCode_Success); + REQUIRE(get_encoding_type(complete_ir_buffer, is_four_bytes_encoding) + == IRErrorCode::IRErrorCode_Success); REQUIRE(match_encoding_type(is_four_bytes_encoding)); // Test if preamble can be properly decoded @@ -486,15 +553,15 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", encoded_tag_t metadata_type; size_t metadata_pos; uint16_t metadata_size; - REQUIRE(decode_preamble(complete_ir_buffer, metadata_type, metadata_pos, metadata_size) == - IRErrorCode::IRErrorCode_Success); + REQUIRE(decode_preamble(complete_ir_buffer, metadata_type, metadata_pos, metadata_size) + == IRErrorCode::IRErrorCode_Success); REQUIRE(encoded_preamble_end_pos == complete_ir_buffer.get_pos()); auto* json_metadata_ptr{size_checked_pointer_cast(ir_buf.data() + metadata_pos)}; - string_view json_metadata {json_metadata_ptr, metadata_size}; + string_view json_metadata{json_metadata_ptr, metadata_size}; auto metadata_json = nlohmann::json::parse(json_metadata); - REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue == - metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey)); + REQUIRE(ffi::ir_stream::cProtocol::Metadata::VersionValue + == metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey)); REQUIRE(ffi::ir_stream::cProtocol::Metadata::EncodingJson == metadata_type); set_timestamp_info(metadata_json, ts_info); REQUIRE(timestamp_pattern_syntax == ts_info.timestamp_pattern_syntax); @@ -504,9 +571,8 @@ TEMPLATE_TEST_CASE("decode_ir_complete", "[ffi][decode_next_message]", string decoded_message; epoch_time_ms_t timestamp; for (size_t ix = 0; ix < reference_messages.size(); ix++) { - REQUIRE(IRErrorCode::IRErrorCode_Success == - decode_next_message(complete_ir_buffer, decoded_message, - timestamp)); + REQUIRE(IRErrorCode::IRErrorCode_Success + == decode_next_message(complete_ir_buffer, decoded_message, timestamp)); 
REQUIRE(decoded_message == reference_messages[ix]); REQUIRE(timestamp == reference_timestamps[ix]); } From f4a5cef4983665cbe239ff0a0100bad241a8a19c Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 9 Aug 2023 22:38:38 -0400 Subject: [PATCH 074/121] Replace unique ptr with vector --- components/core/src/BufferedFileReader.cpp | 34 +++++++++------------- components/core/src/BufferedFileReader.hpp | 13 ++++----- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 509398986..754d4f870 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -9,7 +9,6 @@ // C++ standard libraries #include -using std::make_unique; using std::string; namespace { @@ -43,8 +42,7 @@ BufferedFileReader::BufferedFileReader(size_t base_buffer_size) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } m_base_buffer_size = base_buffer_size; - m_buffer_size = m_base_buffer_size; - m_buffer = make_unique(m_buffer_size); + m_buffer.resize(m_base_buffer_size); } BufferedFileReader::~BufferedFileReader() { @@ -85,7 +83,7 @@ auto BufferedFileReader::try_seek_from_begin(size_t pos) -> ErrorCode { if (-1 == offset) { return ErrorCode_errno; } - m_buffer_reader.emplace(m_buffer.get(), 0); + m_buffer_reader.emplace(m_buffer.data(), 0); m_buffer_begin_pos = pos; } else { auto const num_bytes_to_refill = pos - get_buffer_end_pos(); @@ -206,7 +204,7 @@ auto BufferedFileReader::try_open(string const& path) -> ErrorCode { m_path = path; m_file_pos = 0; m_buffer_begin_pos = 0; - m_buffer_reader.emplace(m_buffer.get(), 0); + m_buffer_reader.emplace(m_buffer.data(), 0); m_highest_read_pos = 0; return ErrorCode_Success; } @@ -234,8 +232,7 @@ void BufferedFileReader::close() { m_fd = -1; if (m_checkpoint_pos.has_value()) { - m_buffer_size = m_base_buffer_size; - m_buffer = make_unique(m_buffer_size); 
+ m_buffer.resize(m_base_buffer_size); m_checkpoint_pos.reset(); } } @@ -292,7 +289,7 @@ auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> Err auto const buffer_end_pos = get_buffer_end_pos(); auto const data_size = m_buffer_reader->get_buffer_size(); - auto const available_buffer_space = m_buffer_size - data_size; + auto const available_buffer_space = m_buffer.size() - data_size; size_t buf_internal_pos{0}; size_t bytes_to_read = m_base_buffer_size - (buffer_end_pos % m_base_buffer_size); @@ -303,10 +300,7 @@ auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> Err // Grow the buffer if bytes_to_read is more // than available space in the buffer if (bytes_to_read > available_buffer_space) { - m_buffer_size = data_size + bytes_to_read; - auto new_buffer = make_unique(m_buffer_size); - memcpy(new_buffer.get(), m_buffer.get(), data_size); - m_buffer = std::move(new_buffer); + m_buffer.resize(data_size + bytes_to_read); } buf_internal_pos = data_size; } else { @@ -321,7 +315,7 @@ auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> Err auto error_code = try_read_into_buffer( m_fd, - m_buffer.get() + buf_internal_pos, + &m_buffer[buf_internal_pos], bytes_to_read, num_bytes_refilled ); @@ -329,7 +323,7 @@ auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> Err return error_code; } m_buffer_reader - .emplace(m_buffer.get(), num_bytes_refilled + buf_internal_pos, buf_internal_pos); + .emplace(m_buffer.data(), num_bytes_refilled + buf_internal_pos, buf_internal_pos); return ErrorCode_Success; } @@ -338,15 +332,15 @@ auto BufferedFileReader::resize_buffer_from_pos(size_t pos) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - auto const copy_size = m_buffer_reader->get_buffer_size() - pos; + auto const new_data_size = m_buffer_reader->get_buffer_size() - pos; // Use a quantized size for the underlying buffer size - m_buffer_size = 
quantize_to_buffer_size(copy_size); - auto new_buffer = make_unique(m_buffer_size); - memcpy(new_buffer.get(), &m_buffer[pos], copy_size); - m_buffer = std::move(new_buffer); + auto const buffer_size = quantize_to_buffer_size(new_data_size); + + m_buffer.erase(m_buffer.begin(), m_buffer.begin() + static_cast(pos)); + m_buffer.resize(buffer_size); m_buffer_begin_pos += pos; - m_buffer_reader.emplace(m_buffer.get(), copy_size); + m_buffer_reader.emplace(m_buffer.data(), new_data_size); } auto BufferedFileReader::update_file_pos(size_t pos) -> void { diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 9aa7bd395..893ab5f69 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -1,13 +1,12 @@ #ifndef BUFFEREDFILEREADER_HPP #define BUFFEREDFILEREADER_HPP -// C standard libraries - // C++ libraries #include #include #include #include +#include // Project headers #include "BufferReader.hpp" @@ -217,8 +216,8 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] auto refill_reader_buffer(size_t refill_size) -> ErrorCode; /** - * Resize the internal reader buffer and copy over data from the original - * buffer staring from pos to the beginning of the resized the buffer + * Discard the data before pos from internal reader buffer and resize the + * buffer * @param pos */ auto resize_buffer_from_pos(size_t pos) -> void; @@ -248,13 +247,11 @@ class BufferedFileReader : public ReaderInterface { size_t m_file_pos{0}; // Buffer specific data - std::unique_ptr m_buffer; + std::vector m_buffer; + size_t m_base_buffer_size; std::optional m_buffer_reader; size_t m_buffer_begin_pos{0}; - // Values for buffer related calculation - size_t m_base_buffer_size; - size_t m_buffer_size; // Variables for checkpoint support std::optional m_checkpoint_pos; size_t m_highest_read_pos{0}; From 1e55966d4011cb6b11baeb617e257955690f5e7e Mon Sep 17 00:00:00 2001 From: haiqi96 
<14502009+haiqi96@users.noreply.github.com> Date: Wed, 9 Aug 2023 23:17:59 -0400 Subject: [PATCH 075/121] Add error code to close function and some small refactoring --- components/core/src/BufferedFileReader.cpp | 15 +++++++++------ components/core/src/BufferedFileReader.hpp | 2 +- .../core/tests/test-BufferedFileReader.cpp | 17 ++++++++++------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 754d4f870..38ef311dc 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -46,7 +46,7 @@ BufferedFileReader::BufferedFileReader(size_t base_buffer_size) { } BufferedFileReader::~BufferedFileReader() { - close(); + std::ignore = close(); } auto BufferedFileReader::try_get_pos(size_t& pos) -> ErrorCode { @@ -224,18 +224,21 @@ void BufferedFileReader::open(string const& path) { } } -void BufferedFileReader::close() { +auto BufferedFileReader::close() -> ErrorCode { + int close_result{0}; if (-1 != m_fd) { - // NOTE: We don't check errors for fclose since it seems - // the only reason it could fail is if it was interrupted by a signal - ::close(m_fd); - m_fd = -1; + close_result = ::close(m_fd); + m_fd = -1; if (m_checkpoint_pos.has_value()) { m_buffer.resize(m_base_buffer_size); m_checkpoint_pos.reset(); } } + if (0 != close_result) { + return ErrorCode_errno; + } + return ErrorCode_Success; } auto BufferedFileReader::set_checkpoint() -> size_t { diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 893ab5f69..3c9993282 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -140,7 +140,7 @@ class BufferedFileReader : public ReaderInterface { /** * Closes the file if it's open */ - auto close() -> void; + [[nodiscard]] auto close() -> ErrorCode; [[nodiscard]] auto get_path() const -> std::string const& { return 
m_path; } diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 0234b05a7..eb568c10c 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -60,7 +60,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { size_t read_size4 = 1; REQUIRE(ErrorCode_EndOfFile == file_reader.try_read(read_buffer + buffer_offset, read_size4, num_bytes_read)); - file_reader.close(); + std::ignore = file_reader.close(); } SECTION("Simple Seek without checkpoint") { @@ -223,21 +223,24 @@ TEST_CASE("Test delimiter", "[BufferedFileReader]") { file_writer.write(test_data, test_data_size); file_writer.close(); - BufferedFileReader buffered_file_reader; - buffered_file_reader.open(test_file_path); + BufferedFileReader file_reader; + file_reader.open(test_file_path); std::string test_string; - FileReader file_reader; - file_reader.open(test_file_path); + FileReader ref_file_reader; + ref_file_reader.open(test_file_path); std::string ref_string; ErrorCode error_code = ErrorCode_Success; char delimiter = (char)('a' + (std::rand() % 26)); while (ErrorCode_EndOfFile != error_code) { - error_code = file_reader.try_read_to_delimiter(delimiter, true, false, ref_string); + error_code = ref_file_reader.try_read_to_delimiter(delimiter, true, false, ref_string); auto error_code2 - = buffered_file_reader.try_read_to_delimiter(delimiter, true, false, test_string); + = file_reader.try_read_to_delimiter(delimiter, true, false, test_string); REQUIRE(error_code2 == error_code); REQUIRE(test_string == ref_string); } + + ref_file_reader.close(); + std::ignore = file_reader.close(); } From d8a50d49c1449c779b1261e515a964091f2dd50d Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 9 Aug 2023 23:31:14 -0400 Subject: [PATCH 076/121] fix --- components/core/src/BufferedFileReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 38ef311dc..c282786f7 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -192,7 +192,7 @@ auto BufferedFileReader::try_read_to_delimiter( auto BufferedFileReader::try_open(string const& path) -> ErrorCode { // Cleanup in case caller forgot to call close before calling this function - close(); + std::ignore = close(); m_fd = ::open(path.c_str(), O_RDONLY); if (-1 == m_fd) { From f3f9ca50907acfa3429489039b3fe41b834198ab Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 10 Aug 2023 10:19:30 -0400 Subject: [PATCH 077/121] Manually cherrypick the changes --- components/core/CMakeLists.txt | 18 ++ .../core/src/EncodedVariableInterpreter.cpp | 34 +++ .../core/src/EncodedVariableInterpreter.hpp | 1 + components/core/src/IrMessageParser.cpp | 232 ++++++++++++++++++ components/core/src/IrMessageParser.hpp | 57 +++++ .../core/src/LogTypeDictionaryEntry.cpp | 8 + .../core/src/LogTypeDictionaryEntry.hpp | 10 + components/core/src/ParsedIrMessage.cpp | 69 ++++++ components/core/src/ParsedIrMessage.hpp | 107 ++++++++ components/core/src/clp/FileCompressor.cpp | 70 +++++- components/core/src/clp/FileCompressor.hpp | 19 ++ .../src/ffi/ir_stream/decoding_methods.cpp | 216 +++++----------- .../src/ffi/ir_stream/decoding_methods.hpp | 83 ++++++- .../src/ffi/ir_stream/decoding_methods.tpp | 144 +++++++++++ .../src/streaming_archive/writer/Archive.cpp | 38 +++ .../src/streaming_archive/writer/Archive.hpp | 12 + 16 files changed, 966 insertions(+), 152 deletions(-) create mode 100644 components/core/src/IrMessageParser.cpp create mode 100644 components/core/src/IrMessageParser.hpp create mode 100644 components/core/src/ParsedIrMessage.cpp create mode 100644 components/core/src/ParsedIrMessage.hpp create mode 100644 components/core/src/ffi/ir_stream/decoding_methods.tpp diff --git 
a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 1717a7a0d..690e006e0 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -191,6 +191,15 @@ set(SOURCE_FILES_clp src/clp/StructuredFileToCompress.hpp src/clp/utils.cpp src/clp/utils.hpp + src/ffi/encoding_methods.cpp + src/ffi/encoding_methods.hpp + src/ffi/encoding_methods.tpp + src/ffi/ir_stream/byteswap.hpp + src/ffi/ir_stream/decoding_methods.cpp + src/ffi/ir_stream/decoding_methods.hpp + src/ffi/ir_stream/decoding_methods.tpp + src/ffi/ir_stream/encoding_methods.cpp + src/ffi/ir_stream/encoding_methods.hpp src/compressor_frontend/Constants.hpp src/compressor_frontend/finite_automata/RegexAST.hpp src/compressor_frontend/finite_automata/RegexAST.tpp @@ -239,6 +248,8 @@ set(SOURCE_FILES_clp src/GlobalMySQLMetadataDB.hpp src/GlobalSQLiteMetadataDB.cpp src/GlobalSQLiteMetadataDB.hpp + src/IrMessageParser.cpp + src/IrMessageParser.hpp src/LibarchiveFileReader.cpp src/LibarchiveFileReader.hpp src/LibarchiveReader.cpp @@ -261,6 +272,8 @@ set(SOURCE_FILES_clp src/PageAllocatedVector.hpp src/ParsedMessage.cpp src/ParsedMessage.hpp + src/ParsedIrMessage.cpp + src/ParsedIrMessage.hpp src/Platform.hpp src/Profiler.cpp src/Profiler.hpp @@ -725,6 +738,7 @@ set(SOURCE_FILES_unitTest src/ffi/ir_stream/byteswap.hpp src/ffi/ir_stream/decoding_methods.cpp src/ffi/ir_stream/decoding_methods.hpp + src/ffi/ir_stream/decoding_methods.tpp src/ffi/ir_stream/encoding_methods.cpp src/ffi/ir_stream/encoding_methods.hpp src/ffi/ir_stream/protocol_constants.hpp @@ -756,6 +770,8 @@ set(SOURCE_FILES_unitTest src/GlobalSQLiteMetadataDB.hpp src/Grep.cpp src/Grep.hpp + src/IrMessageParser.cpp + src/IrMessageParser.hpp src/LibarchiveFileReader.cpp src/LibarchiveFileReader.hpp src/LibarchiveReader.cpp @@ -776,6 +792,8 @@ set(SOURCE_FILES_unitTest src/MySQLPreparedStatement.hpp src/PageAllocatedVector.cpp src/PageAllocatedVector.hpp + src/ParsedIrMessage.cpp + src/ParsedIrMessage.hpp 
src/ParsedMessage.cpp src/ParsedMessage.hpp src/Platform.hpp diff --git a/components/core/src/EncodedVariableInterpreter.cpp b/components/core/src/EncodedVariableInterpreter.cpp index 3545fce30..8cf7fb365 100644 --- a/components/core/src/EncodedVariableInterpreter.cpp +++ b/components/core/src/EncodedVariableInterpreter.cpp @@ -195,6 +195,40 @@ void EncodedVariableInterpreter::convert_encoded_float_to_string (encoded_variab value[value_length - 1 - decimal_pos] = '.'; } +encoded_variable_t +EncodedVariableInterpreter::convert_four_bytes_float_to_clp_encoded_float ( + encoded_variable_t encoded_float) +{ + encoded_float = bit_cast(encoded_float); + + size_t decimal_pos; + size_t num_digits; + size_t digits; + bool is_negative; + + // Decode according to the format described in encode_string_as_float_compact_var + decimal_pos = (encoded_float & 0x07) + 1; + encoded_float >>= 3; + num_digits = (encoded_float & 0x07) + 1; + encoded_float >>= 3; + digits = encoded_float & ffi::cFourByteEncodedFloatDigitsBitMask; + encoded_float >>= 25; + is_negative = encoded_float > 0; + + // encode again. 
+ uint64_t clp_encoded_float = 0; + if (is_negative) { + clp_encoded_float = 1; + } + clp_encoded_float <<= 55; // 1 unused + 54 for digits of the float + clp_encoded_float |= digits & cEightByteEncodedFloatDigitsBitMask; + clp_encoded_float <<= 4; + clp_encoded_float |= (num_digits - 1) & 0x0F; + clp_encoded_float <<= 4; + clp_encoded_float |= (decimal_pos - 1) & 0x0F; + return bit_cast(clp_encoded_float); +} + void EncodedVariableInterpreter::encode_and_add_to_dictionary (const string& message, LogTypeDictionaryEntry& logtype_dict_entry, VariableDictionaryWriter& var_dict, vector& encoded_vars, vector& var_ids) diff --git a/components/core/src/EncodedVariableInterpreter.hpp b/components/core/src/EncodedVariableInterpreter.hpp index f0a71d3bc..1b041ad84 100644 --- a/components/core/src/EncodedVariableInterpreter.hpp +++ b/components/core/src/EncodedVariableInterpreter.hpp @@ -63,6 +63,7 @@ class EncodedVariableInterpreter { */ static void convert_encoded_float_to_string (encoded_variable_t encoded_var, std::string& value); + static encoded_variable_t convert_four_bytes_float_to_clp_encoded_float (encoded_variable_t var); /** * Parses all variables from a message (while constructing the logtype) and encodes them (adding them to the variable dictionary if necessary) * @param message diff --git a/components/core/src/IrMessageParser.cpp b/components/core/src/IrMessageParser.cpp new file mode 100644 index 000000000..8273020c3 --- /dev/null +++ b/components/core/src/IrMessageParser.cpp @@ -0,0 +1,232 @@ +#include "IrMessageParser.hpp" + +// C standard libraries + +// C++ standard libraries + +// Project headers +#include "BufferReader.hpp" +#include "EncodedVariableInterpreter.hpp" +#include "ffi/encoding_methods.hpp" +#include "ffi/ir_stream/protocol_constants.hpp" + +// spdlog +#include "spdlog/spdlog.h" + +// json +#include "../../../submodules/json/single_include/nlohmann/json.hpp" + +using ffi::cVariablePlaceholderEscapeCharacter; +using 
ffi::four_byte_encoded_variable_t; +using ffi::eight_byte_encoded_variable_t; +using ffi::ir_stream::cProtocol::MagicNumberLength; +using ffi::ir_stream::IRErrorCode; +using ffi::VariablePlaceholder; +using std::string; +using std::vector; + +/** + * Constructs the class by setting the internal reader, parsing the metadata + * and initializing variable based on the metadata + * @param reader + * @throw OperationFailed if the reader doesn't contain IR encoded data, + * or IR data that can't be properly decoded + */ +IrMessageParser::IrMessageParser (ReaderInterface& reader) : m_reader(reader) { + + if (false == is_ir_encoded(m_reader, m_is_four_bytes_encoded)) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + string json_metadata; + if (false == decode_json_preamble(json_metadata)) { + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + const string mocked_ts_pattern = "%Y-%m-%dT%H:%M:%S.%3"; + try { + auto metadata_json = nlohmann::json::parse(json_metadata); + string version = metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey); + if (version != ffi::ir_stream::cProtocol::Metadata::VersionValue) { + SPDLOG_ERROR("Input IR has unsupported version {}", version); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + // For now, use a fixed timestamp pattern + m_ts_pattern = TimestampPattern(0, mocked_ts_pattern); + + if (m_is_four_bytes_encoded) { + m_reference_timestamp = std::stoll(metadata_json.at( + ffi::ir_stream::cProtocol::Metadata::ReferenceTimestampKey).get()); + m_msg.set_ts(m_reference_timestamp); + } + + } catch (const nlohmann::json::parse_error& e) { + SPDLOG_ERROR("Failed to parse json metadata from reader"); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + m_msg.set_ts_pattern(&m_ts_pattern); +} + +bool IrMessageParser::parse_next_encoded_message () { + if (m_is_four_bytes_encoded) { + return parse_next_four_bytes_message(); + } + return 
parse_next_eight_bytes_message(); +} + +bool IrMessageParser::parse_next_eight_bytes_message () { + m_msg.clear(); + + epochtime_t ts; + vector encoded_vars; + vector dict_vars; + string logtype; + + auto error_code = ffi::ir_stream::generic_parse_tokens( + m_reader, logtype, encoded_vars, dict_vars, ts + ); + + if (IRErrorCode::IRErrorCode_Success != error_code) { + if (IRErrorCode::IRErrorCode_Eof != error_code) { + SPDLOG_ERROR("Corrupted IR, error code: {}", error_code); + } + return false; + } + + auto constant_handler = [this] (const std::string& value, size_t begin_pos, size_t length) { + m_msg.append_to_logtype(value, begin_pos, length); + }; + + auto encoded_int_handler = [this] (eight_byte_encoded_variable_t value) { + auto decoded_int = ffi::decode_integer_var(value); + m_msg.add_encoded_integer(value, decoded_int.length()); + }; + + auto encoded_float_handler = [this] (eight_byte_encoded_variable_t encoded_float) { + auto decoded_float = ffi::decode_float_var(encoded_float); + m_msg.add_encoded_float(encoded_float, decoded_float.size()); + }; + + auto dict_var_handler = [this] (const string& dict_var) { + m_msg.add_dictionary_var(dict_var); + }; + + // handle timestamp + m_msg.set_ts(ts); + try { + ffi::ir_stream::generic_decode_message(logtype, encoded_vars, dict_vars, + constant_handler, encoded_int_handler, + encoded_float_handler, dict_var_handler); + } catch (ffi::ir_stream::DecodingException& e) { + SPDLOG_ERROR("Decoding failed with exception {}", e.what()); + return false; + } + + return true; +} + +bool IrMessageParser::parse_next_four_bytes_message () { + m_msg.clear(); + + epochtime_t ts; + vector encoded_vars; + vector dict_vars; + string logtype; + + auto error_code = ffi::ir_stream::generic_parse_tokens( + m_reader, logtype, encoded_vars, dict_vars, ts + ); + + if (IRErrorCode::IRErrorCode_Success != error_code) { + if (IRErrorCode::IRErrorCode_Eof != error_code) { + SPDLOG_ERROR("Corrupted IR, error code: {}", error_code); + } + return 
false; + } + + auto constant_handler = [this] (const std::string& value, size_t begin_pos, size_t length) { + m_msg.append_to_logtype(value, begin_pos, length); + }; + + auto encoded_int_handler = [this] (four_byte_encoded_variable_t value) { + // assume that we need the actual size + auto decoded_int = ffi::decode_integer_var(value); + m_msg.add_encoded_integer(value, decoded_int.length()); + }; + + auto encoded_float_handler = [this] (four_byte_encoded_variable_t encoded_float) { + auto decoded_float = ffi::decode_float_var(encoded_float); + auto converted_float = EncodedVariableInterpreter::convert_four_bytes_float_to_clp_encoded_float(encoded_float); + m_msg.add_encoded_float(converted_float, decoded_float.size()); + }; + + auto dict_var_handler = [this] (const string& dict_var) { + encoded_variable_t converted_var; + if (EncodedVariableInterpreter::convert_string_to_representable_integer_var(dict_var, converted_var)) { + m_msg.add_encoded_integer(converted_var, dict_var.size()); + } else if (EncodedVariableInterpreter::convert_string_to_representable_float_var(dict_var, converted_var)) { + m_msg.add_encoded_float(converted_var, dict_var.size()); + } else { + m_msg.add_dictionary_var(dict_var); + } + }; + + // handle timestamp + m_reference_timestamp += ts; + m_msg.set_ts(m_reference_timestamp); + try { + ffi::ir_stream::generic_decode_message(logtype, encoded_vars, dict_vars, + constant_handler, encoded_int_handler, + encoded_float_handler, dict_var_handler); + } catch (ffi::ir_stream::DecodingException& e) { + SPDLOG_ERROR("Decoding failed with exception {}", e.what()); + return false; + } + + return true; +} + +bool IrMessageParser::is_ir_encoded (ReaderInterface& reader, bool& is_four_bytes_encoded) { + // Note. currently this method doesn't recover file pos. 
+ if (ffi::ir_stream::IRErrorCode_Success != + ffi::ir_stream::get_encoding_type(reader, is_four_bytes_encoded)) { + return false; + } + return true; +} + +bool IrMessageParser::is_ir_encoded (size_t sequence_length, const char* data) { + if (sequence_length < MagicNumberLength) { + return false; + } + bool is_four_bytes_encoded; + BufferReader encoding_data (data, MagicNumberLength); + if (ffi::ir_stream::IRErrorCode_Success != + ffi::ir_stream::get_encoding_type(encoding_data, is_four_bytes_encoded)) { + return false; + } + return true; +} + +bool IrMessageParser::decode_json_preamble (std::string& json_metadata) { + // Decode and parse metadata + ffi::ir_stream::encoded_tag_t metadata_type; + std::vector metadata_vec; + + if (ffi::ir_stream::IRErrorCode_Success != + ffi::ir_stream::decode_preamble(m_reader, metadata_type, metadata_vec)) { + SPDLOG_ERROR("Failed to parse metadata"); + return false; + } + + if (ffi::ir_stream::cProtocol::Metadata::EncodingJson != metadata_type) { + SPDLOG_ERROR("Unexpected metadata type {}", metadata_type); + return false; + } + + json_metadata.assign(reinterpret_cast(metadata_vec.data()), + metadata_vec.size()); + + return true; +} \ No newline at end of file diff --git a/components/core/src/IrMessageParser.hpp b/components/core/src/IrMessageParser.hpp new file mode 100644 index 000000000..519e95ac1 --- /dev/null +++ b/components/core/src/IrMessageParser.hpp @@ -0,0 +1,57 @@ + +#ifndef IrMessageParser_HPP +#define IrMessageParser_HPP + +// C standard libraries + +// C++ standard libraries + +// Project headers +#include "TraceableException.hpp" +#include "ffi/ir_stream/decoding_methods.hpp" +#include "ParsedIrMessage.hpp" + +/* + * Class representing the parser that parses messages from encoded IR and + * converts the message into CLP encoding format + */ +class IrMessageParser { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed (ErrorCode error_code, const 
char* const filename, int line_number) : + TraceableException (error_code, filename, line_number) {} + + // Methods + const char* what () const noexcept override { + return "IrMessageParser operation failed"; + } + }; + // Constructor + IrMessageParser (ReaderInterface& reader); + + // Methods + static bool is_ir_encoded (size_t sequence_length, const char* data); + TimestampPattern* get_ts_pattern () { return &m_ts_pattern; } + const ParsedIrMessage& get_parsed_msg () const { return m_msg; } + LogTypeDictionaryEntry& get_msg_logtype_entry() { return m_msg.get_logtype_entry(); } + bool parse_next_encoded_message (); + +private: + + bool parse_next_four_bytes_message(); + bool parse_next_eight_bytes_message(); + bool decode_json_preamble (std::string& json_metadata); + bool is_ir_encoded (ReaderInterface& reader, bool& is_four_bytes_encoded); + + // member variables + bool m_is_four_bytes_encoded; + epochtime_t m_reference_timestamp; + TimestampPattern m_ts_pattern; + ParsedIrMessage m_msg; + ReaderInterface& m_reader; +}; + +#endif // IrMessageParser_HPP \ No newline at end of file diff --git a/components/core/src/LogTypeDictionaryEntry.cpp b/components/core/src/LogTypeDictionaryEntry.cpp index f84f01e67..affda397c 100644 --- a/components/core/src/LogTypeDictionaryEntry.cpp +++ b/components/core/src/LogTypeDictionaryEntry.cpp @@ -53,6 +53,14 @@ size_t LogTypeDictionaryEntry::get_data_size () const { m_ids_of_segments_containing_entry.size() * sizeof(segment_id_t); } +void LogTypeDictionaryEntry::set_logtype (std::string_view logtype) { + m_value = logtype; +} + +void LogTypeDictionaryEntry::set_var_positions(const std::vector& var_positions) { + m_var_positions = var_positions; +} + void LogTypeDictionaryEntry::add_constant (const string& value_containing_constant, size_t begin_pos, size_t length) { m_value.append(value_containing_constant, begin_pos, length); } diff --git a/components/core/src/LogTypeDictionaryEntry.hpp 
b/components/core/src/LogTypeDictionaryEntry.hpp index 85a2aabde..53d518cdd 100644 --- a/components/core/src/LogTypeDictionaryEntry.hpp +++ b/components/core/src/LogTypeDictionaryEntry.hpp @@ -86,6 +86,16 @@ class LogTypeDictionaryEntry : public DictionaryEntry { * @return Size of the data contained in this entry */ size_t get_data_size () const; + /** + * Sets the logtype from a given string_view + * @param logtype + */ + void set_logtype (std::string_view logtype); + + /** + * Sets the variable positions from the given vector + */ + void set_var_positions(const std::vector& var_positions); /** * Adds a constant to the logtype diff --git a/components/core/src/ParsedIrMessage.cpp b/components/core/src/ParsedIrMessage.cpp new file mode 100644 index 000000000..66bceedbd --- /dev/null +++ b/components/core/src/ParsedIrMessage.cpp @@ -0,0 +1,69 @@ +#include "ParsedIrMessage.hpp" + +// C standard libraries + +// C++ standard libraries + +// Project headers +#include "type_utils.hpp" +#include "LogTypeDictionaryEntry.hpp" + +// spdlog +#include "spdlog/spdlog.h" + +using std::string; + +void ParsedIrMessage::set_ts (epochtime_t ts) { + m_ts = ts; + if (ts != 0) { + m_orig_num_bytes += m_ts_bytes; + } +} + +void ParsedIrMessage::set_ts_pattern (const TimestampPattern* timestamp_pattern) { + if (m_ts_patt != nullptr) { + SPDLOG_ERROR("Can not set different timestamp for an IR file"); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + m_ts_patt = timestamp_pattern; + // get a rough estimation of ts string size + string empty_msg; + m_ts_patt->insert_formatted_timestamp(0, empty_msg); + m_ts_bytes = empty_msg.length(); + +} + +void ParsedIrMessage::append_to_logtype (const string& value, size_t begin_pos, size_t length) { + m_logtype_entry.add_constant(value, begin_pos, length); + m_orig_num_bytes += length; +} + +void ParsedIrMessage::clear () { + m_ts_patt = nullptr; + m_ts_bytes = 0; + clear_except_ts_patt(); +} + +void ParsedIrMessage::clear_except_ts_patt () { 
+ m_variables.clear(); + m_orig_num_bytes = 0; + m_logtype_entry.clear(); +} + +void ParsedIrMessage::add_dictionary_var (const string& dictionary_var) { + m_variables.emplace_back(dictionary_var); + m_logtype_entry.add_dictionary_var(); + m_orig_num_bytes += dictionary_var.size(); +} + +void ParsedIrMessage::add_encoded_integer (encoded_variable_t var, size_t orginal_size_in_bytes) { + m_variables.emplace_back(var); + m_logtype_entry.add_int_var(); + m_orig_num_bytes += orginal_size_in_bytes; +} + +void ParsedIrMessage::add_encoded_float (encoded_variable_t var, size_t orginal_size_in_bytes) { + m_variables.emplace_back(var); + m_logtype_entry.add_float_var(); + m_orig_num_bytes += orginal_size_in_bytes; +} \ No newline at end of file diff --git a/components/core/src/ParsedIrMessage.hpp b/components/core/src/ParsedIrMessage.hpp new file mode 100644 index 000000000..85f88f907 --- /dev/null +++ b/components/core/src/ParsedIrMessage.hpp @@ -0,0 +1,107 @@ +#ifndef ParsedIrMessage_HPP +#define ParsedIrMessage_HPP + +// C++ standard libraries +#include +#include +#include + +// Project headers +#include "Defs.h" +#include "LogTypeDictionaryEntry.hpp" +#include "TimestampPattern.hpp" + +/** + * ParsedIrMessage represents a (potentially multiline) log message parsed from encoded IR + * into four primary fields: logtype_entry, variables, timestamp and timestamp pattern. 
+ */ +class ParsedIrMessage { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : + TraceableException (error_code, filename, line_number) {} + + // Methods + const char* what () const noexcept override { + return "ParsedIrMessage operation failed"; + } + }; + + enum class VariableType { + EncodedVar = 0, + DictVar, + Length + }; + + // Helper class to keep variables in the order they appear in the + // original log messages + class IrVariable { + public: + IrVariable(const std::string& dict_var) { + m_dict_var = dict_var; + m_type = VariableType::DictVar; + } + IrVariable(encoded_variable_t encoded_var) { + m_encoded_var = encoded_var; + m_type = VariableType::EncodedVar; + } + + // Methods + VariableType type() const { + return m_type; + } + + encoded_variable_t get_encoded_var () const { + assert(m_type == VariableType::EncodedVar); + return m_encoded_var; + } + + const std::string& get_dict_var () const { + assert(m_type == VariableType::DictVar); + return m_dict_var; + } + + private: + std::string m_dict_var; + encoded_variable_t m_encoded_var; + VariableType m_type; + }; + + // Constructor + ParsedIrMessage() : m_ts_patt(nullptr) {} + + // Methods + void clear(); + void clear_except_ts_patt (); + + // setter + void set_ts (epochtime_t ts); + void set_ts_pattern (const TimestampPattern* timestamp_pattern); + + // note, this logtype is already escaped + void append_to_logtype (const std::string& value, size_t begin_pos, size_t length); + void add_encoded_integer (encoded_variable_t var, size_t original_size_in_bytes); + void add_encoded_float (encoded_variable_t var, size_t original_size_in_bytes); + void add_dictionary_var (const std::string& dictionary_var); + + // getter + epochtime_t get_ts () const { return m_ts; } + LogTypeDictionaryEntry& get_logtype_entry () { return m_logtype_entry; } + const std::vector& get_vars 
() const { return m_variables; } + size_t get_orig_num_bytes() const { return m_orig_num_bytes; } + +private: + // Variables + const TimestampPattern* m_ts_patt; + epochtime_t m_ts; + LogTypeDictionaryEntry m_logtype_entry; + std::vector m_variables; + size_t m_orig_num_bytes; + size_t m_ts_bytes; +}; + + +#endif // ParsedIrMessage_HPP \ No newline at end of file diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 2d774a517..72bcc4006 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -12,6 +12,7 @@ #include // Project headers +#include "../IrMessageParser.hpp" #include "../Profiler.hpp" #include "utils.hpp" @@ -116,7 +117,7 @@ namespace clp { } } - m_file_reader.close(); + std::ignore = m_file_reader.close(); Profiler::stop_continuous_measurement(); LOG_CONTINUOUS_MEASUREMENT(Profiler::ContinuousMeasurementIndex::ParseLogFile) @@ -184,6 +185,50 @@ namespace clp { close_file_and_append_to_segment(archive_writer); } + bool FileCompressor::try_compressing_as_ir (size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + const std::string& path_for_compression, + group_id_t group_id, + streaming_archive::writer::Archive& archive_writer, + ReaderInterface& reader) + { + // Construct the MessageParser which parse encoding type and metadata + // as part of the construction process + try { + IrMessageParser ir_message_parser(reader); + // Open compressed file + archive_writer.create_and_open_file(path_for_compression, group_id, + m_uuid_generator(), 0); + + // Assume one encoded file only has one timestamp pattern + archive_writer.change_ts_pattern(ir_message_parser.get_ts_pattern()); + + while (true) { + if (false == ir_message_parser.parse_next_encoded_message()) { + break; + } + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + 
split_file_and_archive(archive_user_config, path_for_compression, group_id, + ir_message_parser.get_ts_pattern(), archive_writer); + } else if (archive_writer.get_file().get_encoded_size_in_bytes() >= + target_encoded_file_size) { + split_file(path_for_compression, group_id, ir_message_parser.get_ts_pattern(), + archive_writer); + } + const auto& parsed_msg = ir_message_parser.get_parsed_msg(); + archive_writer.write_ir_message(parsed_msg.get_ts(), + ir_message_parser.get_msg_logtype_entry(), + parsed_msg.get_vars(), + parsed_msg.get_orig_num_bytes()); + } + close_file_and_append_to_segment(archive_writer); + return true; + } catch (TraceableException& e) { + return false; + } + } + bool FileCompressor::try_compressing_as_archive (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config, size_t target_encoded_file_size, const FileToCompress& file_to_compress, streaming_archive::writer::Archive& archive_writer, bool use_heuristic) @@ -260,8 +305,9 @@ namespace clp { continue; } } + auto file_path = std::string(m_libarchive_reader.get_path()); if (is_utf8_sequence(m_utf8_validation_buf_length, m_utf8_validation_buf)) { - auto boost_path_for_compression = parent_boost_path / m_libarchive_reader.get_path(); + auto boost_path_for_compression = parent_boost_path / file_path; if (use_heuristic) { parse_and_encode_with_heuristic(target_data_size_of_dicts, archive_user_config, target_encoded_file_size, boost_path_for_compression.string(), file_to_compress.get_group_id(), archive_writer, @@ -270,8 +316,26 @@ namespace clp { parse_and_encode(target_data_size_of_dicts, archive_user_config, target_encoded_file_size, boost_path_for_compression.string(), file_to_compress.get_group_id(), archive_writer, m_libarchive_file_reader); } + } else if (IrMessageParser::is_ir_encoded(m_utf8_validation_buf_length, + m_utf8_validation_buf)) { + // Remove .clp suffix if found + if (file_path.length() > 4 && + file_path.substr(file_path.length() - 
4) == ".clp") + { + file_path = file_path.substr(0, file_path.length() - 4); + } + auto boost_path_for_compression = parent_boost_path / file_path; + if (false == try_compressing_as_ir(target_data_size_of_dicts, + archive_user_config, + target_encoded_file_size, + boost_path_for_compression.string(), + file_to_compress.get_group_id(), + archive_writer, + m_libarchive_file_reader)) { + SPDLOG_ERROR("Failed to compress {} - corrupted IR", file_path); + } } else { - SPDLOG_ERROR("Cannot compress {} - not UTF-8 encoded.", m_libarchive_reader.get_path()); + SPDLOG_ERROR("Cannot compress {} - not UTF-8 or IR encoded", file_path); succeeded = false; } diff --git a/components/core/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp index e2fe3d92d..68ccc2154 100644 --- a/components/core/src/clp/FileCompressor.hpp +++ b/components/core/src/clp/FileCompressor.hpp @@ -10,6 +10,7 @@ #include "../LibarchiveReader.hpp" #include "../MessageParser.hpp" #include "../ParsedMessage.hpp" +#include "../ParsedIrMessage.hpp" #include "../streaming_archive/writer/Archive.hpp" #include "FileToCompress.hpp" #include "../compressor_frontend/LogParser.hpp" @@ -75,6 +76,24 @@ namespace clp { size_t target_encoded_file_size, const FileToCompress& file_to_compress, streaming_archive::writer::Archive& archive_writer, bool use_heuristic); + /** + * Parses and encodes IR from the given reader into the given archive_writer + * @param target_data_size_of_dicts + * @param archive_user_config + * @param target_encoded_file_size + * @param path_for_compression + * @param group_id + * @param archive_writer + * @param reader + */ + bool try_compressing_as_ir (size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + const std::string& path_for_compression, + group_id_t group_id, + streaming_archive::writer::Archive& archive_writer, + ReaderInterface& reader); + // Variables 
boost::uuids::random_generator& m_uuid_generator; BufferedFileReader m_file_reader; diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 8828f58af..c26d51320 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -72,25 +72,6 @@ template IRErrorCode parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); -/** - * Decodes the next encoded message from reader - * @tparam encoded_variable_t Type of the encoded variable - * @param reader - * @param message Returns the decoded message - * @param timestamp Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t or the actual - * timestamp if encoded_variable_t == eight_byte_encoded_variable_t - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Decode_Error if the encoded message cannot be - * properly decoded - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ -template -static IRErrorCode -generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp); - /** * Reads metadata information from the reader * @param reader @@ -273,8 +254,13 @@ parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_m } template -static IRErrorCode -generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) { +IRErrorCode generic_parse_tokens( + ReaderInterface& reader, + string& logtype, + vector& encoded_vars, + vector& dict_vars, + epoch_time_ms_t& timestamp +) { encoded_tag_t encoded_tag; if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; @@ -284,13 +270,11 @@ generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time } // Handle 
variables - vector encoded_vars; - vector dict_vars; - encoded_variable_t encoded_variable; string var_str; bool is_encoded_var; while (is_variable_tag(encoded_tag, is_encoded_var)) { if (is_encoded_var) { + encoded_variable_t encoded_variable; if (false == decode_int(reader, encoded_variable)) { return IRErrorCode_Incomplete_IR; } @@ -309,7 +293,6 @@ generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time } // Handle logtype - string logtype; if (auto error_code = parse_logtype(reader, encoded_tag, logtype); IRErrorCode_Success != error_code) { @@ -326,10 +309,51 @@ generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time { return error_code; } + return IRErrorCode_Success; +} + +template +static IRErrorCode +generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) { + message.clear(); + + vector encoded_vars; + vector dict_vars; + string logtype; + if (auto error_code = generic_parse_tokens(reader, logtype, encoded_vars, dict_vars, timestamp); + IRErrorCode_Success != error_code) + { + return error_code; + } + + // constant handler + auto constant_handler = [&message](string const& value, size_t begin_pos, size_t length) { + message.append(value, begin_pos, length); + }; + + // encoded int handler + auto encoded_int_handler + = [&message](encoded_variable_t value) { message.append(decode_integer_var(value)); }; + + // encoded float handler + auto encoded_float_handler = [&message](encoded_variable_t encoded_float) { + message.append(decode_float_var(encoded_float)); + }; + + // dict var handler + auto dict_var_handler = [&message](string const& dict_var) { message.append(dict_var); }; try { - message = decode_message(logtype, encoded_vars, dict_vars); - } catch (EncodingException const& e) { + generic_decode_message( + logtype, + encoded_vars, + dict_vars, + constant_handler, + encoded_int_handler, + encoded_float_handler, + dict_var_handler + ); + } catch (DecodingException 
const& e) { return IRErrorCode_Decode_Error; } return IRErrorCode_Success; @@ -367,127 +391,6 @@ read_metadata_info(ReaderInterface& reader, encoded_tag_t& metadata_type, uint16 return IRErrorCode_Success; } -template -static string decode_message( - string const& logtype, - vector const& encoded_vars, - vector const& dictionary_vars -) { - string message; - size_t encoded_vars_length = encoded_vars.size(); - size_t dict_vars_length = dictionary_vars.size(); - size_t next_static_text_begin_pos = 0; - - size_t dictionary_vars_ix = 0; - size_t encoded_vars_ix = 0; - for (size_t cur_pos = 0; cur_pos < logtype.length(); ++cur_pos) { - auto c = logtype[cur_pos]; - switch (c) { - case enum_to_underlying_type(VariablePlaceholder::Float): { - message.append( - logtype, - next_static_text_begin_pos, - cur_pos - next_static_text_begin_pos - ); - next_static_text_begin_pos = cur_pos + 1; - if (encoded_vars_ix >= encoded_vars_length) { - throw EncodingException( - ErrorCode_Corrupt, - __FILENAME__, - __LINE__, - cTooFewEncodedVarsErrorMessage - ); - } - message.append(decode_float_var(encoded_vars[encoded_vars_ix])); - ++encoded_vars_ix; - - break; - } - - case enum_to_underlying_type(VariablePlaceholder::Integer): { - message.append( - logtype, - next_static_text_begin_pos, - cur_pos - next_static_text_begin_pos - ); - next_static_text_begin_pos = cur_pos + 1; - if (encoded_vars_ix >= encoded_vars_length) { - throw EncodingException( - ErrorCode_Corrupt, - __FILENAME__, - __LINE__, - cTooFewEncodedVarsErrorMessage - ); - } - message.append(decode_integer_var(encoded_vars[encoded_vars_ix])); - ++encoded_vars_ix; - - break; - } - - case enum_to_underlying_type(VariablePlaceholder::Dictionary): { - message.append( - logtype, - next_static_text_begin_pos, - cur_pos - next_static_text_begin_pos - ); - next_static_text_begin_pos = cur_pos + 1; - if (dictionary_vars_ix >= dict_vars_length) { - throw EncodingException( - ErrorCode_Corrupt, - __FILENAME__, - __LINE__, - 
cTooFewDictionaryVarsErrorMessage - ); - } - message.append(dictionary_vars[dictionary_vars_ix]); - ++dictionary_vars_ix; - - break; - } - - case cVariablePlaceholderEscapeCharacter: { - // Ensure the escape character is followed by a character that's - // being escaped - if (cur_pos == logtype.length() - 1) { - throw EncodingException( - ErrorCode_Corrupt, - __FILENAME__, - __LINE__, - cUnexpectedEscapeCharacterMessage - ); - } - message.append( - logtype, - next_static_text_begin_pos, - cur_pos - next_static_text_begin_pos - ); - - // Skip the escape character - next_static_text_begin_pos = cur_pos + 1; - // The character after the escape character is static text - // (regardless of whether it is a variable placeholder), so - // increment cur_pos by 1 to ensure we don't process the next - // character in any of the other cases (instead it will be added - // to the message). - ++cur_pos; - - break; - } - } - } - // Add remainder - if (next_static_text_begin_pos < logtype.length()) { - message.append( - logtype, - next_static_text_begin_pos, - logtype.length() - next_static_text_begin_pos - ); - } - - return message; -} - IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding) { char buffer[cProtocol::MagicNumberLength]; auto error_code = reader.try_read_exact_length(buffer, cProtocol::MagicNumberLength); @@ -575,4 +478,21 @@ namespace eight_byte_encoding { ); } } // namespace eight_byte_encoding + +// Explicitly declare specializations +template IRErrorCode generic_parse_tokens( + ReaderInterface& reader, + string& logtype, + vector& encoded_vars, + vector& dict_vars, + epoch_time_ms_t& timestamp +); + +template IRErrorCode generic_parse_tokens( + ReaderInterface& reader, + string& logtype, + vector& encoded_vars, + vector& dict_vars, + epoch_time_ms_t& timestamp +); } // namespace ffi::ir_stream diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 
cd6b409ab..1943f1670 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -1,7 +1,7 @@ #ifndef FFI_IR_STREAM_DECODING_METHODS_HPP #define FFI_IR_STREAM_DECODING_METHODS_HPP -#include +#include #include #include "../../ReaderInterface.hpp" @@ -18,6 +18,25 @@ typedef enum { IRErrorCode_Incomplete_IR, } IRErrorCode; +class DecodingException : public TraceableException { +public: + // Constructors + DecodingException( + ErrorCode error_code, + char const* const filename, + int line_number, + std::string message + ) + : TraceableException(error_code, filename, line_number), + m_message(std::move(message)) {} + + // Methods + [[nodiscard]] char const* what() const noexcept override { return m_message.c_str(); } + +private: + std::string m_message; +}; + /** * Decodes the encoding type for the encoded IR stream * @param ir_buf @@ -29,6 +48,66 @@ typedef enum { */ IRErrorCode get_encoding_type(ReaderInterface& ir_buf, bool& is_four_bytes_encoding); +/** + * Parse logtypes, dictionary variables and encoded variables + * from the next encoded IR message. Returns the parsed tokens by + * reference + * @tparam encoded_variable_t + * @param reader + * @param logtype + * @param encoded_vars + * @param dict_vars + * @param timestamp + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + */ +template +IRErrorCode generic_parse_tokens( + ReaderInterface& reader, + std::string& logtype, + std::vector& encoded_vars, + std::vector& dict_vars, + epoch_time_ms_t& timestamp +); + +/** + * Decodes the message consists of the tokens and calls the given methods + * to handle specific components of the message. + * @tparam encoded_variable_t Type of the encoded variable + * @tparam ConstantHandler Method to handle constants. 
Signature: + * (const std::string&, size_t, size_t) -> void + * @tparam EncodedIntHandler Method to handle encoded integers. + * Signature: (encoded_variable_t) -> void + * @tparam EncodedFloatHandler Method to handle encoded float. + * Signature: (encoded_variable_t) -> void + * @tparam DictVarHandler Method to handle dictionary variables. + * Signature: (const std::string&) -> void + * @param logtype + * @param encoded_vars + * @param dict_vars + * @param constant_handler + * @param encoded_int_handler + * @param encoded_float_handler + * @param dict_var_handler + * @throw DecodingException if the message can not be decoded properly + */ +template < + typename encoded_variable_t, + typename ConstantHandler, + typename EncodedIntHandler, + typename EncodedFloatHandler, + typename DictVarHandler> +void generic_decode_message( + std::string const& logtype, + std::vector const& encoded_vars, + std::vector const& dict_vars, + ConstantHandler constant_handler, + EncodedIntHandler encoded_int_handler, + EncodedFloatHandler encoded_float_handler, + DictVarHandler dict_var_handler +); + /** * Decodes the preamble for an IR stream. 
* @param ir_buf @@ -103,4 +182,6 @@ namespace four_byte_encoding { } // namespace four_byte_encoding } // namespace ffi::ir_stream +#include "decoding_methods.tpp" + #endif // FFI_IR_STREAM_DECODING_METHODS_HPP diff --git a/components/core/src/ffi/ir_stream/decoding_methods.tpp b/components/core/src/ffi/ir_stream/decoding_methods.tpp new file mode 100644 index 000000000..52678e533 --- /dev/null +++ b/components/core/src/ffi/ir_stream/decoding_methods.tpp @@ -0,0 +1,144 @@ +#ifndef FFI_IR_STREAM_DECODING_METHODS_TPP +#define FFI_IR_STREAM_DECODING_METHODS_TPP + +// C++ standard libraries +#include +#include + +// Project headers +#include "../../ReaderInterface.hpp" +#include "../encoding_methods.hpp" +#include "decoding_methods.hpp" +#include "protocol_constants.hpp" + +namespace ffi::ir_stream { +template < + typename encoded_variable_t, + typename ConstantHandler, + typename EncodedIntHandler, + typename EncodedFloatHandler, + typename DictVarHandler> +void generic_decode_message( + std::string const& logtype, + std::vector const& encoded_vars, + std::vector const& dict_vars, + ConstantHandler constant_handler, + EncodedIntHandler encoded_int_handler, + EncodedFloatHandler encoded_float_handler, + DictVarHandler dict_var_handler +) { + size_t const logtype_length = logtype.length(); + size_t const encoded_vars_length = encoded_vars.size(); + size_t const dict_vars_length = dict_vars.size(); + size_t next_static_text_begin_pos = 0; + + size_t dictionary_vars_ix = 0; + size_t encoded_vars_ix = 0; + for (size_t cur_pos = 0; cur_pos < logtype_length; ++cur_pos) { + auto c = logtype[cur_pos]; + switch (c) { + case enum_to_underlying_type(VariablePlaceholder::Float): { + constant_handler( + logtype, + next_static_text_begin_pos, + cur_pos - next_static_text_begin_pos + ); + next_static_text_begin_pos = cur_pos + 1; + if (encoded_vars_ix >= encoded_vars_length) { + throw DecodingException( + ErrorCode_Corrupt, + __FILENAME__, + __LINE__, + cTooFewEncodedVarsErrorMessage 
+ ); + } + encoded_float_handler(encoded_vars[encoded_vars_ix]); + ++encoded_vars_ix; + + break; + } + + case enum_to_underlying_type(VariablePlaceholder::Integer): { + constant_handler( + logtype, + next_static_text_begin_pos, + cur_pos - next_static_text_begin_pos + ); + next_static_text_begin_pos = cur_pos + 1; + if (encoded_vars_ix >= encoded_vars_length) { + throw DecodingException( + ErrorCode_Corrupt, + __FILENAME__, + __LINE__, + cTooFewEncodedVarsErrorMessage + ); + } + encoded_int_handler(encoded_vars[encoded_vars_ix]); + ++encoded_vars_ix; + + break; + } + + case enum_to_underlying_type(VariablePlaceholder::Dictionary): { + constant_handler( + logtype, + next_static_text_begin_pos, + cur_pos - next_static_text_begin_pos + ); + next_static_text_begin_pos = cur_pos + 1; + if (dictionary_vars_ix >= dict_vars_length) { + throw DecodingException( + ErrorCode_Corrupt, + __FILENAME__, + __LINE__, + cTooFewDictionaryVarsErrorMessage + ); + } + dict_var_handler(dict_vars[dictionary_vars_ix]); + ++dictionary_vars_ix; + + break; + } + + case cVariablePlaceholderEscapeCharacter: { + // Ensure the escape character is followed by a + // character that's being escaped + if (cur_pos == logtype_length - 1) { + throw DecodingException( + ErrorCode_Corrupt, + __FILENAME__, + __LINE__, + cUnexpectedEscapeCharacterMessage + ); + } + constant_handler( + logtype, + next_static_text_begin_pos, + cur_pos - next_static_text_begin_pos + ); + + // Skip the escape character + next_static_text_begin_pos = cur_pos + 1; + // The character after the escape character is static text + // (regardless of whether it is a variable placeholder), so + // increment cur_pos by 1 to ensure we don't process the + // next character in any of the other cases (instead it will + // be added to the message). 
+ ++cur_pos; + + break; + } + } + } + // Add remainder + if (next_static_text_begin_pos < logtype_length) { + constant_handler( + logtype, + next_static_text_begin_pos, + logtype_length - next_static_text_begin_pos + ); + } +} +} // namespace ffi::ir_stream + +#endif // FFI_IR_STREAM_DECODING_METHODS_TPP diff --git a/components/core/src/streaming_archive/writer/Archive.cpp b/components/core/src/streaming_archive/writer/Archive.cpp index ba09d4b29..3100602c0 100644 --- a/components/core/src/streaming_archive/writer/Archive.cpp +++ b/components/core/src/streaming_archive/writer/Archive.cpp @@ -269,6 +269,44 @@ namespace streaming_archive::writer { } } + void Archive::write_ir_message (epochtime_t timestamp, + LogTypeDictionaryEntry& logtype_entry, + const std::vector& variables, + size_t num_uncompressed_bytes) { + // Encode logtype + logtype_dictionary_id_t logtype_id; + m_logtype_dict.add_entry(logtype_entry, logtype_id); + + vector encoded_vars; + vector var_ids; + // Encode variable base on type + for (const auto& var : variables) { + if (var.type() == ParsedIrMessage::VariableType::EncodedVar) { + encoded_vars.push_back(var.get_encoded_var()); + } else if (var.type() == ParsedIrMessage::VariableType::DictVar) { + variable_dictionary_id_t id; + m_var_dict.add_entry(var.get_dict_var(), id); + encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); + var_ids.push_back(id); + } else { + throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); + } + } + + m_file->write_encoded_msg(timestamp, logtype_id, encoded_vars, + var_ids, num_uncompressed_bytes); + + // Update segment indices + if (m_file->has_ts_pattern()) { + m_logtype_ids_in_segment_for_files_with_timestamps.insert(logtype_id); + m_var_ids_in_segment_for_files_with_timestamps.insert_all(var_ids); + } else { + m_logtype_ids_for_file_with_unassigned_segment.insert(logtype_id); + m_var_ids_for_file_with_unassigned_segment.insert(var_ids.cbegin(), var_ids.cend()); + } + + } + void 
Archive::write_msg_using_schema (compressor_frontend::Token*& uncompressed_msg, uint32_t uncompressed_msg_pos, const bool has_delimiter, const bool has_timestamp) { epochtime_t timestamp = 0; diff --git a/components/core/src/streaming_archive/writer/Archive.hpp b/components/core/src/streaming_archive/writer/Archive.hpp index b48703772..cf7232fc2 100644 --- a/components/core/src/streaming_archive/writer/Archive.hpp +++ b/components/core/src/streaming_archive/writer/Archive.hpp @@ -18,6 +18,7 @@ #include "../../ErrorCode.hpp" #include "../../GlobalMetadataDB.hpp" #include "../../LogTypeDictionaryWriter.hpp" +#include "../../ParsedIrMessage.hpp" #include "../../VariableDictionaryWriter.hpp" #include "../../compressor_frontend/Token.hpp" #include "../ArchiveMetadata.hpp" @@ -128,6 +129,17 @@ namespace streaming_archive { namespace writer { * @throw FileWriter::OperationFailed if any write fails */ void write_msg (epochtime_t timestamp, const std::string& message, size_t num_uncompressed_bytes); + /** + * Encodes and writes a message to the current encoded file + * @param timestamp + * @param logtype_entry + * @param variables + * @param num_uncompressed_bytes + * @throw FileWriter::OperationFailed if any write fails + */ + void write_ir_message (epochtime_t timestamp, LogTypeDictionaryEntry& logtype_entry, + const std::vector& variables, + size_t num_uncompressed_bytes); /** * Encodes and writes a message to the given file using schema file * @param file From beb5640b617dc54fcbeef66dc05759006db38f9e Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 10 Aug 2023 18:48:14 -0400 Subject: [PATCH 078/121] Fix some clang-tidy issues --- components/core/src/ParsedIrMessage.cpp | 21 +- components/core/src/ParsedIrMessage.hpp | 71 +- .../src/ffi/ir_stream/decoding_methods.cpp | 611 +++++++++--------- 3 files changed, 337 insertions(+), 366 deletions(-) diff --git a/components/core/src/ParsedIrMessage.cpp 
b/components/core/src/ParsedIrMessage.cpp index 66bceedbd..a1ead4de3 100644 --- a/components/core/src/ParsedIrMessage.cpp +++ b/components/core/src/ParsedIrMessage.cpp @@ -5,7 +5,6 @@ // C++ standard libraries // Project headers -#include "type_utils.hpp" #include "LogTypeDictionaryEntry.hpp" // spdlog @@ -13,14 +12,14 @@ using std::string; -void ParsedIrMessage::set_ts (epochtime_t ts) { +auto ParsedIrMessage::set_ts(epochtime_t ts) -> void { m_ts = ts; if (ts != 0) { m_orig_num_bytes += m_ts_bytes; } } -void ParsedIrMessage::set_ts_pattern (const TimestampPattern* timestamp_pattern) { +auto ParsedIrMessage::set_ts_pattern(TimestampPattern const* timestamp_pattern) -> void { if (m_ts_patt != nullptr) { SPDLOG_ERROR("Can not set different timestamp for an IR file"); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); @@ -30,40 +29,40 @@ void ParsedIrMessage::set_ts_pattern (const TimestampPattern* timestamp_pattern) string empty_msg; m_ts_patt->insert_formatted_timestamp(0, empty_msg); m_ts_bytes = empty_msg.length(); - } -void ParsedIrMessage::append_to_logtype (const string& value, size_t begin_pos, size_t length) { +auto ParsedIrMessage::append_to_logtype(string const& value, size_t begin_pos, size_t length) -> void { m_logtype_entry.add_constant(value, begin_pos, length); m_orig_num_bytes += length; } -void ParsedIrMessage::clear () { +auto ParsedIrMessage::clear() -> void { m_ts_patt = nullptr; m_ts_bytes = 0; clear_except_ts_patt(); } -void ParsedIrMessage::clear_except_ts_patt () { +auto ParsedIrMessage::clear_except_ts_patt() -> void { m_variables.clear(); m_orig_num_bytes = 0; + m_ts = 0; m_logtype_entry.clear(); } -void ParsedIrMessage::add_dictionary_var (const string& dictionary_var) { +auto ParsedIrMessage::add_dictionary_var(string const& dictionary_var) -> void { m_variables.emplace_back(dictionary_var); m_logtype_entry.add_dictionary_var(); m_orig_num_bytes += dictionary_var.size(); } -void ParsedIrMessage::add_encoded_integer 
(encoded_variable_t var, size_t orginal_size_in_bytes) { +auto ParsedIrMessage::add_encoded_integer(encoded_variable_t var, size_t orginal_size_in_bytes) -> void { m_variables.emplace_back(var); m_logtype_entry.add_int_var(); m_orig_num_bytes += orginal_size_in_bytes; } -void ParsedIrMessage::add_encoded_float (encoded_variable_t var, size_t orginal_size_in_bytes) { +auto ParsedIrMessage::add_encoded_float(encoded_variable_t var, size_t orginal_size_in_bytes) -> void { m_variables.emplace_back(var); m_logtype_entry.add_float_var(); m_orig_num_bytes += orginal_size_in_bytes; -} \ No newline at end of file +} diff --git a/components/core/src/ParsedIrMessage.hpp b/components/core/src/ParsedIrMessage.hpp index 85f88f907..3b249cebf 100644 --- a/components/core/src/ParsedIrMessage.hpp +++ b/components/core/src/ParsedIrMessage.hpp @@ -4,6 +4,7 @@ // C++ standard libraries #include #include +#include #include // Project headers @@ -21,11 +22,11 @@ class ParsedIrMessage { class OperationFailed : public TraceableException { public: // Constructors - OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : - TraceableException (error_code, filename, line_number) {} + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} // Methods - const char* what () const noexcept override { + [[nodiscard]] auto what() const noexcept -> char const* override { return "ParsedIrMessage operation failed"; } }; @@ -40,68 +41,64 @@ class ParsedIrMessage { // original log messages class IrVariable { public: - IrVariable(const std::string& dict_var) { - m_dict_var = dict_var; - m_type = VariableType::DictVar; - } - IrVariable(encoded_variable_t encoded_var) { - m_encoded_var = encoded_var; - m_type = VariableType::EncodedVar; - } + explicit IrVariable(std::string dict_var) + : m_dict_var(std::move(dict_var)), + m_type(VariableType::DictVar) {} + + explicit 
IrVariable(encoded_variable_t encoded_var) + : m_encoded_var(encoded_var), + m_type(VariableType::EncodedVar) {} // Methods - VariableType type() const { - return m_type; - } + [[nodiscard]] auto type() const -> VariableType { return m_type; } - encoded_variable_t get_encoded_var () const { + [[nodiscard]] auto get_encoded_var() const -> encoded_variable_t { assert(m_type == VariableType::EncodedVar); return m_encoded_var; } - const std::string& get_dict_var () const { + [[nodiscard]] auto get_dict_var() const -> std::string const& { assert(m_type == VariableType::DictVar); return m_dict_var; } private: std::string m_dict_var; - encoded_variable_t m_encoded_var; + encoded_variable_t m_encoded_var{0}; VariableType m_type; }; - // Construtor - ParsedIrMessage() : m_ts_patt(nullptr) {} - // Methods void clear(); - void clear_except_ts_patt (); + void clear_except_ts_patt(); // setter - void set_ts (epochtime_t ts); - void set_ts_pattern (const TimestampPattern* timestamp_pattern); + void set_ts(epochtime_t ts); + void set_ts_pattern(TimestampPattern const* timestamp_pattern); // note, this logtype is already escaped - void append_to_logtype (const std::string& value, size_t begin_pos, size_t length); - void add_encoded_integer (encoded_variable_t var, size_t original_size_in_bytes); - void add_encoded_float (encoded_variable_t var, size_t original_size_in_bytes); - void add_dictionary_var (const std::string& dictionary_var); + void append_to_logtype(std::string const& value, size_t begin_pos, size_t length); + void add_encoded_integer(encoded_variable_t var, size_t original_size_in_bytes); + void add_encoded_float(encoded_variable_t var, size_t original_size_in_bytes); + void add_dictionary_var(std::string const& dictionary_var); // getter - epochtime_t get_ts () const { return m_ts; } - LogTypeDictionaryEntry& get_logtype_entry () { return m_logtype_entry; } - const std::vector& get_vars () const { return m_variables; } - size_t get_orig_num_bytes() const { return 
m_orig_num_bytes; } + [[nodiscard]] auto get_ts() const -> epochtime_t { return m_ts; } + + auto get_logtype_entry() -> LogTypeDictionaryEntry& { return m_logtype_entry; } + + [[nodiscard]] auto get_vars() const -> std::vector const& { return m_variables; } + + [[nodiscard]] auto get_orig_num_bytes() const -> size_t { return m_orig_num_bytes; } private: // Variables - const TimestampPattern* m_ts_patt; - epochtime_t m_ts; + TimestampPattern const* m_ts_patt{nullptr}; + epochtime_t m_ts{0}; LogTypeDictionaryEntry m_logtype_entry; std::vector m_variables; - size_t m_orig_num_bytes; - size_t m_ts_bytes; + size_t m_orig_num_bytes{0}; + size_t m_ts_bytes{0}; }; - -#endif // ParsedIrMessage_HPP \ No newline at end of file +#endif // ParsedIrMessage_HPP diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index c26d51320..e2870cbf2 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -8,260 +8,310 @@ using std::string; using std::vector; namespace ffi::ir_stream { -/** - * @tparam encoded_variable_t Type of the encoded variable - * @param tag - * @param is_encoded_var Returns true if tag is for an encoded variable (as - * opposed to a dictionary variable) - * @return Whether the tag is a variable tag - */ -template -static bool is_variable_tag(encoded_tag_t tag, bool& is_encoded_var); - -/** - * Decodes an integer from reader - * @tparam integer_t Type of the integer to decode - * @param reader - * @param value Returns the decoded integer - * @return true on success, false if the reader doesn't contain enough data - * to decode - */ -template -static bool decode_int(ReaderInterface& reader, integer_t& value); - -/** - * Decodes the next logtype string from reader - * @param reader - * @param encoded_tag - * @param logtype Returns the logtype string - * @return IRErrorCode_Success on success - * @return 
IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ -static IRErrorCode -parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype); - -/** - * Decodes the next dictionary-type variable string from reader - * @param reader - * @param encoded_tag - * @param dict_var Returns the dictionary variable - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough - * data to decode - */ -static IRErrorCode -parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var); - -/** - * Parses the next timestamp from reader - * @tparam encoded_variable_t Type of the encoded variable - * @param reader - * @param encoded_tag - * @param ts Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t or the actual - * timestamp if encoded_variable_t == eight_byte_encoded_variable_t - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ -template -IRErrorCode -parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); - -/** - * Reads metadata information from the reader - * @param reader - * @param metadata_type Returns the type of the metadata found in the IR - * @param metadata_pos Returns the starting position of the metadata in reader - * @param metadata_size Returns the size of the metadata written in the IR - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ -static IRErrorCode -read_metadata_info(ReaderInterface& reader, encoded_tag_t& metadata_type, 
uint16_t& metadata_size); - -/** - * Decodes the message from the given logtype, encoded variables, and - * dictionary variables. This function properly handles escaped variable - * placeholders in the logtype, as opposed to ffi::decode_message that - * doesn't handle escaped placeholders for simplicity - * @tparam encoded_variable_t Type of the encoded variable - * @param logtype - * @param encoded_vars - * @param dictionary_vars - * @return The decoded message - * @throw EncodingException if the message can't be decoded - */ -template -static string decode_message( - string const& logtype, - vector const& encoded_vars, - vector const& dictionary_vars -); - -template -static bool is_variable_tag(encoded_tag_t tag, bool& is_encoded_var) { - static_assert(is_same_v || is_same_v); - if (tag == cProtocol::Payload::VarStrLenUByte || tag == cProtocol::Payload::VarStrLenUShort - || tag == cProtocol::Payload::VarStrLenInt) - { - is_encoded_var = false; - return true; - } - - if constexpr (is_same_v) { - if (tag == cProtocol::Payload::VarEightByteEncoding) { - is_encoded_var = true; - return true; - } - } else { - if (tag == cProtocol::Payload::VarFourByteEncoding) { - is_encoded_var = true; +namespace { + /** + * @tparam encoded_variable_t Type of the encoded variable + * @param tag + * @param is_encoded_var Returns true if tag is for an encoded variable (as + * opposed to a dictionary variable) + * @return Whether the tag is a variable tag + */ + template + auto is_variable_tag(encoded_tag_t tag, bool& is_encoded_var) -> bool { + static_assert(is_same_v || is_same_v); + if (tag == cProtocol::Payload::VarStrLenUByte || tag == cProtocol::Payload::VarStrLenUShort + || tag == cProtocol::Payload::VarStrLenInt) + { + is_encoded_var = false; return true; } - } - return false; -} -template -static bool decode_int(ReaderInterface& reader, integer_t& value) { - integer_t value_little_endian; - if (reader.try_read_numeric_value(value_little_endian) != ErrorCode_Success) { + if 
constexpr (is_same_v) { + if (tag == cProtocol::Payload::VarEightByteEncoding) { + is_encoded_var = true; + return true; + } + } else { + if (tag == cProtocol::Payload::VarFourByteEncoding) { + is_encoded_var = true; + return true; + } + } return false; } - constexpr auto read_size = sizeof(integer_t); - static_assert(read_size == 1 || read_size == 2 || read_size == 4 || read_size == 8); - if constexpr (read_size == 1) { - value = value_little_endian; - } else if constexpr (read_size == 2) { - value = bswap_16(value_little_endian); - } else if constexpr (read_size == 4) { - value = bswap_32(value_little_endian); - } else if constexpr (read_size == 8) { - value = bswap_64(value_little_endian); + /** + * Decodes an integer from reader + * @tparam integer_t Type of the integer to decode + * @param reader + * @param value Returns the decoded integer + * @return true on success, false if the reader doesn't contain enough data + * to decode + */ + template + bool decode_int(ReaderInterface& reader, integer_t& value) { + integer_t value_little_endian; + if (reader.try_read_numeric_value(value_little_endian) != ErrorCode_Success) { + return false; + } + + constexpr auto read_size = sizeof(integer_t); + static_assert(read_size == 1 || read_size == 2 || read_size == 4 || read_size == 8); + if constexpr (read_size == 1) { + value = value_little_endian; + } else if constexpr (read_size == 2) { + value = bswap_16(value_little_endian); + } else if constexpr (read_size == 4) { + value = bswap_32(value_little_endian); + } else if constexpr (read_size == 8) { + value = bswap_64(value_little_endian); + } + return true; } - return true; -} -static IRErrorCode -parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype) { - size_t logtype_length; - if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { - uint8_t length; - if (false == decode_int(reader, length)) { - return IRErrorCode_Incomplete_IR; + /** + * Decodes the next logtype string from reader + * 
@param reader + * @param encoded_tag + * @param logtype Returns the logtype string + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + * to decode + */ + IRErrorCode parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype) { + size_t logtype_length; + if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { + uint8_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + logtype_length = length; + } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenUShort) { + uint16_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + logtype_length = length; + } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenInt) { + int32_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + logtype_length = length; + } else { + return IRErrorCode_Corrupted_IR; } - logtype_length = length; - } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenUShort) { - uint16_t length; - if (false == decode_int(reader, length)) { + + if (ErrorCode_Success != reader.try_read_string(logtype_length, logtype)) { return IRErrorCode_Incomplete_IR; } - logtype_length = length; - } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenInt) { - int32_t length; - if (false == decode_int(reader, length)) { + return IRErrorCode_Success; + } + + /** + * Decodes the next dictionary-type variable string from reader + * @param reader + * @param encoded_tag + * @param dict_var Returns the dictionary variable + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough + * data to decode + */ + IRErrorCode + parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& 
dict_var) { + // Decode variable's length + size_t var_length; + if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { + uint8_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + var_length = length; + } else if (cProtocol::Payload::VarStrLenUShort == encoded_tag) { + uint16_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + var_length = length; + } else if (cProtocol::Payload::VarStrLenInt == encoded_tag) { + int32_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + var_length = length; + } else { + return IRErrorCode_Corrupted_IR; + } + + // Read the dictionary variable + if (ErrorCode_Success != reader.try_read_string(var_length, dict_var)) { return IRErrorCode_Incomplete_IR; } - logtype_length = length; - } else { - return IRErrorCode_Corrupted_IR; - } - if (ErrorCode_Success != reader.try_read_string(logtype_length, logtype)) { - return IRErrorCode_Incomplete_IR; + return IRErrorCode_Success; } - return IRErrorCode_Success; -} -static IRErrorCode -parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var) { - // Decode variable's length - size_t var_length; - if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { - uint8_t length; - if (false == decode_int(reader, length)) { + /** + * Reads metadata information from the reader + * @param reader + * @param metadata_type Returns the type of the metadata found in the IR + * @param metadata_pos Returns the starting position of the metadata in reader + * @param metadata_size Returns the size of the metadata written in the IR + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + * to decode + */ + IRErrorCode read_metadata_info( + ReaderInterface& reader, + encoded_tag_t& metadata_type, + uint16_t& metadata_size + 
) { + if (ErrorCode_Success != reader.try_read_numeric_value(metadata_type)) { return IRErrorCode_Incomplete_IR; } - var_length = length; - } else if (cProtocol::Payload::VarStrLenUShort == encoded_tag) { - uint16_t length; - if (false == decode_int(reader, length)) { + + // Read metadata length + encoded_tag_t encoded_tag; + if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } - var_length = length; - } else if (cProtocol::Payload::VarStrLenInt == encoded_tag) { - int32_t length; - if (false == decode_int(reader, length)) { - return IRErrorCode_Incomplete_IR; + switch (encoded_tag) { + case cProtocol::Metadata::LengthUByte: + uint8_t ubyte_res; + if (false == decode_int(reader, ubyte_res)) { + return IRErrorCode_Incomplete_IR; + } + metadata_size = ubyte_res; + break; + case cProtocol::Metadata::LengthUShort: + uint16_t ushort_res; + if (false == decode_int(reader, ushort_res)) { + return IRErrorCode_Incomplete_IR; + } + metadata_size = ushort_res; + break; + default: + return IRErrorCode_Corrupted_IR; } - var_length = length; - } else { - return IRErrorCode_Corrupted_IR; + return IRErrorCode_Success; } - // Read the dictionary variable - if (ErrorCode_Success != reader.try_read_string(var_length, dict_var)) { - return IRErrorCode_Incomplete_IR; - } - - return IRErrorCode_Success; -} - -template -IRErrorCode -parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { - static_assert(is_same_v || is_same_v); + /** + * Parses the next timestamp from reader + * @tparam encoded_variable_t Type of the encoded variable + * @param reader + * @param encoded_tag + * @param ts Returns the timestamp delta if + * encoded_variable_t == four_byte_encoded_variable_t or the actual + * timestamp if encoded_variable_t == eight_byte_encoded_variable_t + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR 
if reader doesn't contain enough data + * to decode + */ + template + IRErrorCode + parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { + static_assert(is_same_v || is_same_v); - if constexpr (is_same_v) { - if (cProtocol::Payload::TimestampVal != encoded_tag) { - return IRErrorCode_Corrupted_IR; - } - if (false == decode_int(reader, ts)) { - return IRErrorCode_Incomplete_IR; - } - } else { - if (cProtocol::Payload::TimestampDeltaByte == encoded_tag) { - int8_t ts_delta; - if (false == decode_int(reader, ts_delta)) { - return IRErrorCode_Incomplete_IR; - } - ts = ts_delta; - } else if (cProtocol::Payload::TimestampDeltaShort == encoded_tag) { - int16_t ts_delta; - if (false == decode_int(reader, ts_delta)) { - return IRErrorCode_Incomplete_IR; + if constexpr (is_same_v) { + if (cProtocol::Payload::TimestampVal != encoded_tag) { + return IRErrorCode_Corrupted_IR; } - ts = ts_delta; - } else if (cProtocol::Payload::TimestampDeltaInt == encoded_tag) { - int32_t ts_delta; - if (false == decode_int(reader, ts_delta)) { + if (false == decode_int(reader, ts)) { return IRErrorCode_Incomplete_IR; } - ts = ts_delta; } else { - return IRErrorCode_Corrupted_IR; + if (cProtocol::Payload::TimestampDeltaByte == encoded_tag) { + int8_t ts_delta; + if (false == decode_int(reader, ts_delta)) { + return IRErrorCode_Incomplete_IR; + } + ts = ts_delta; + } else if (cProtocol::Payload::TimestampDeltaShort == encoded_tag) { + int16_t ts_delta; + if (false == decode_int(reader, ts_delta)) { + return IRErrorCode_Incomplete_IR; + } + ts = ts_delta; + } else if (cProtocol::Payload::TimestampDeltaInt == encoded_tag) { + int32_t ts_delta; + if (false == decode_int(reader, ts_delta)) { + return IRErrorCode_Incomplete_IR; + } + ts = ts_delta; + } else { + return IRErrorCode_Corrupted_IR; + } } + return IRErrorCode_Success; } - return IRErrorCode_Success; -} + + template + IRErrorCode generic_decode_next_message( + ReaderInterface& reader, + string& message, 
+ epoch_time_ms_t& timestamp + ) { + message.clear(); + + vector encoded_vars; + vector dict_vars; + string logtype; + if (auto error_code + = generic_parse_tokens(reader, logtype, encoded_vars, dict_vars, timestamp); + IRErrorCode_Success != error_code) + { + return error_code; + } + + // constant handler + auto constant_handler = [&message](string const& value, size_t begin_pos, size_t length) { + message.append(value, begin_pos, length); + }; + + // encoded int handler + auto encoded_int_handler = [&message](encoded_variable_t value) { + message.append(decode_integer_var(value)); + }; + + // encoded float handler + auto encoded_float_handler = [&message](encoded_variable_t encoded_float) { + message.append(decode_float_var(encoded_float)); + }; + + // dict var handler + auto dict_var_handler = [&message](string const& dict_var) { message.append(dict_var); }; + + try { + generic_decode_message( + logtype, + encoded_vars, + dict_vars, + constant_handler, + encoded_int_handler, + encoded_float_handler, + dict_var_handler + ); + } catch (DecodingException const& e) { + return IRErrorCode_Decode_Error; + } + return IRErrorCode_Success; + } +} // namespace template -IRErrorCode generic_parse_tokens( +auto generic_parse_tokens( ReaderInterface& reader, string& logtype, vector& encoded_vars, vector& dict_vars, epoch_time_ms_t& timestamp -) { - encoded_tag_t encoded_tag; +) -> IRErrorCode { + encoded_tag_t encoded_tag{cProtocol::Eof}; if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } @@ -271,7 +321,7 @@ IRErrorCode generic_parse_tokens( // Handle variables string var_str; - bool is_encoded_var; + bool is_encoded_var{false}; while (is_variable_tag(encoded_tag, is_encoded_var)) { if (is_encoded_var) { encoded_variable_t encoded_variable; @@ -312,96 +362,23 @@ IRErrorCode generic_parse_tokens( return IRErrorCode_Success; } -template -static IRErrorCode -generic_decode_next_message(ReaderInterface& reader, string& 
message, epoch_time_ms_t& timestamp) { - message.clear(); - - vector encoded_vars; - vector dict_vars; - string logtype; - if (auto error_code = generic_parse_tokens(reader, logtype, encoded_vars, dict_vars, timestamp); - IRErrorCode_Success != error_code) - { - return error_code; - } - - // constant handler - auto constant_handler = [&message](string const& value, size_t begin_pos, size_t length) { - message.append(value, begin_pos, length); - }; - - // encoded int handler - auto encoded_int_handler - = [&message](encoded_variable_t value) { message.append(decode_integer_var(value)); }; - - // encoded float handler - auto encoded_float_handler = [&message](encoded_variable_t encoded_float) { - message.append(decode_float_var(encoded_float)); - }; - - // dict var handler - auto dict_var_handler = [&message](string const& dict_var) { message.append(dict_var); }; - - try { - generic_decode_message( - logtype, - encoded_vars, - dict_vars, - constant_handler, - encoded_int_handler, - encoded_float_handler, - dict_var_handler - ); - } catch (DecodingException const& e) { - return IRErrorCode_Decode_Error; - } - return IRErrorCode_Success; -} - -static IRErrorCode -read_metadata_info(ReaderInterface& reader, encoded_tag_t& metadata_type, uint16_t& metadata_size) { - if (ErrorCode_Success != reader.try_read_numeric_value(metadata_type)) { - return IRErrorCode_Incomplete_IR; - } - - // Read metadata length - encoded_tag_t encoded_tag; - if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { - return IRErrorCode_Incomplete_IR; - } - switch (encoded_tag) { - case cProtocol::Metadata::LengthUByte: - uint8_t ubyte_res; - if (false == decode_int(reader, ubyte_res)) { - return IRErrorCode_Incomplete_IR; - } - metadata_size = ubyte_res; - break; - case cProtocol::Metadata::LengthUShort: - uint16_t ushort_res; - if (false == decode_int(reader, ushort_res)) { - return IRErrorCode_Incomplete_IR; - } - metadata_size = ushort_res; - break; - default: - return 
IRErrorCode_Corrupted_IR; - } - return IRErrorCode_Success; -} - -IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding) { - char buffer[cProtocol::MagicNumberLength]; - auto error_code = reader.try_read_exact_length(buffer, cProtocol::MagicNumberLength); +auto get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding) -> IRErrorCode { + std::vector buffer(cProtocol::MagicNumberLength, '\0'); + auto error_code = reader.try_read_exact_length(buffer.data(), cProtocol::MagicNumberLength); if (error_code != ErrorCode_Success) { return IRErrorCode_Incomplete_IR; } - if (0 == memcmp(buffer, cProtocol::FourByteEncodingMagicNumber, cProtocol::MagicNumberLength)) { + if (0 + == memcmp( + buffer.data(), + cProtocol::FourByteEncodingMagicNumber, + cProtocol::MagicNumberLength + )) + { is_four_bytes_encoding = true; } else if ((0 == memcmp( - buffer, + buffer.data(), cProtocol::EightByteEncodingMagicNumber, cProtocol::MagicNumberLength ))) @@ -413,12 +390,12 @@ IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encod return IRErrorCode_Success; } -IRErrorCode decode_preamble( +auto decode_preamble( ReaderInterface& reader, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size -) { +) -> IRErrorCode { if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); error_code != IRErrorCode_Success) { @@ -431,12 +408,12 @@ IRErrorCode decode_preamble( return IRErrorCode_Success; } -IRErrorCode decode_preamble( +auto decode_preamble( ReaderInterface& reader, encoded_tag_t& metadata_type, std::vector& metadata -) { - uint16_t metadata_size; +) -> IRErrorCode { + uint16_t metadata_size{0}; if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); error_code != IRErrorCode_Success) { @@ -455,11 +432,9 @@ IRErrorCode decode_preamble( } namespace four_byte_encoding { - IRErrorCode decode_next_message( - ReaderInterface& reader, - string& message, - 
epoch_time_ms_t& timestamp_delta - ) { + auto + decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp_delta) + -> IRErrorCode { return generic_decode_next_message( reader, message, @@ -469,8 +444,8 @@ namespace four_byte_encoding { } // namespace four_byte_encoding namespace eight_byte_encoding { - IRErrorCode - decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) { + auto decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) + -> IRErrorCode { return generic_decode_next_message( reader, message, @@ -480,19 +455,19 @@ namespace eight_byte_encoding { } // namespace eight_byte_encoding // Explicitly declare specializations -template IRErrorCode generic_parse_tokens( +template auto generic_parse_tokens( ReaderInterface& reader, string& logtype, vector& encoded_vars, vector& dict_vars, epoch_time_ms_t& timestamp -); +) -> IRErrorCode; -template IRErrorCode generic_parse_tokens( +template auto generic_parse_tokens( ReaderInterface& reader, string& logtype, vector& encoded_vars, vector& dict_vars, epoch_time_ms_t& timestamp -); +) -> IRErrorCode; } // namespace ffi::ir_stream From f49d7c1babd8586ca9d73c5ca005bd0e1f846a9a Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Fri, 11 Aug 2023 00:15:46 -0400 Subject: [PATCH 079/121] more clean up --- .../core/src/EncodedVariableInterpreter.cpp | 41 +++++---- .../core/src/EncodedVariableInterpreter.hpp | 11 ++- components/core/src/IrMessageParser.cpp | 85 ++++++++++--------- components/core/src/IrMessageParser.hpp | 19 ++--- components/core/src/ParsedIrMessage.hpp | 16 ++-- 5 files changed, 91 insertions(+), 81 deletions(-) diff --git a/components/core/src/EncodedVariableInterpreter.cpp b/components/core/src/EncodedVariableInterpreter.cpp index 8cf7fb365..22fbba4fd 100644 --- a/components/core/src/EncodedVariableInterpreter.cpp +++ b/components/core/src/EncodedVariableInterpreter.cpp 
@@ -195,11 +195,11 @@ void EncodedVariableInterpreter::convert_encoded_float_to_string (encoded_variab value[value_length - 1 - decimal_pos] = '.'; } -encoded_variable_t -EncodedVariableInterpreter::convert_four_bytes_float_to_clp_encoded_float ( - encoded_variable_t encoded_float) -{ - encoded_float = bit_cast(encoded_float); +void EncodedVariableInterpreter::convert_four_bytes_float_to_eight_byte( + encoded_variable_t four_bytes_float, + encoded_variable_t& eight_bytes_float +) { + four_bytes_float = bit_cast(four_bytes_float); size_t decimal_pos; size_t num_digits; @@ -207,26 +207,25 @@ EncodedVariableInterpreter::convert_four_bytes_float_to_clp_encoded_float ( bool is_negative; // Decode according to the format described in encode_string_as_float_compact_var - decimal_pos = (encoded_float & 0x07) + 1; - encoded_float >>= 3; - num_digits = (encoded_float & 0x07) + 1; - encoded_float >>= 3; - digits = encoded_float & ffi::cFourByteEncodedFloatDigitsBitMask; - encoded_float >>= 25; - is_negative = encoded_float > 0; + decimal_pos = (four_bytes_float & 0x07) + 1; + four_bytes_float >>= 3; + num_digits = (four_bytes_float & 0x07) + 1; + four_bytes_float >>= 3; + digits = four_bytes_float & ffi::cFourByteEncodedFloatDigitsBitMask; + four_bytes_float >>= 25; + is_negative = four_bytes_float > 0; // encode again. 
- uint64_t clp_encoded_float = 0; + eight_bytes_float = 0; if (is_negative) { - clp_encoded_float = 1; + eight_bytes_float = 1; } - clp_encoded_float <<= 55; // 1 unused + 54 for digits of the float - clp_encoded_float |= digits & cEightByteEncodedFloatDigitsBitMask; - clp_encoded_float <<= 4; - clp_encoded_float |= (num_digits - 1) & 0x0F; - clp_encoded_float <<= 4; - clp_encoded_float |= (decimal_pos - 1) & 0x0F; - return bit_cast(clp_encoded_float); + eight_bytes_float <<= 55; // 1 unused + 54 for digits of the float + eight_bytes_float |= digits & cEightByteEncodedFloatDigitsBitMask; + eight_bytes_float <<= 4; + eight_bytes_float |= (num_digits - 1) & 0x0F; + eight_bytes_float <<= 4; + eight_bytes_float |= (decimal_pos - 1) & 0x0F; } void EncodedVariableInterpreter::encode_and_add_to_dictionary (const string& message, LogTypeDictionaryEntry& logtype_dict_entry, diff --git a/components/core/src/EncodedVariableInterpreter.hpp b/components/core/src/EncodedVariableInterpreter.hpp index 1b041ad84..586229f1d 100644 --- a/components/core/src/EncodedVariableInterpreter.hpp +++ b/components/core/src/EncodedVariableInterpreter.hpp @@ -62,8 +62,15 @@ class EncodedVariableInterpreter { * @param value */ static void convert_encoded_float_to_string (encoded_variable_t encoded_var, std::string& value); - - static encoded_variable_t convert_four_bytes_float_to_clp_encoded_float (encoded_variable_t var); + /** + * Converts the four bytes encoded float to eight byte encoded float + * @param four_bytes_float + * @param eight_bytes_float + */ + static void convert_four_bytes_float_to_eight_byte( + encoded_variable_t four_bytes_float, + encoded_variable_t& eight_bytes_float + ); /** * Parses all variables from a message (while constructing the logtype) and encodes them (adding them to the variable dictionary if necessary) * @param message diff --git a/components/core/src/IrMessageParser.cpp b/components/core/src/IrMessageParser.cpp index 8273020c3..d3872e57d 100644 --- 
a/components/core/src/IrMessageParser.cpp +++ b/components/core/src/IrMessageParser.cpp @@ -1,9 +1,5 @@ #include "IrMessageParser.hpp" -// C standard libraries - -// C++ standard libraries - // Project headers #include "BufferReader.hpp" #include "EncodedVariableInterpreter.hpp" @@ -25,6 +21,24 @@ using ffi::VariablePlaceholder; using std::string; using std::vector; +namespace { +/** + * Decodes Ir header from the reader and return its encoding type by reference + * @param reader + * @param is_four_bytes_encoded Returns the encoding type + * or Ir header that can't be properly decoded + */ +[[nodiscard]] auto decode_ir_magic_number(ReaderInterface& reader, bool& is_four_bytes_encoded) -> bool { + // Note. On failure, this method doesn't recover file pos. + if (ffi::ir_stream::IRErrorCode_Success + != ffi::ir_stream::get_encoding_type(reader, is_four_bytes_encoded)) + { + return false; + } + return true; +} +} // namespace + /** * Constructs the class by setting the internal reader, parsing the metadata * and initializing variable based on the metadata @@ -33,8 +47,7 @@ using std::vector; * or IR data that can't be properly decoded */ IrMessageParser::IrMessageParser (ReaderInterface& reader) : m_reader(reader) { - - if (false == is_ir_encoded(m_reader, m_is_four_bytes_encoded)) { + if (false == decode_ir_magic_number(m_reader, m_is_four_bytes_encoded)) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } @@ -68,17 +81,17 @@ IrMessageParser::IrMessageParser (ReaderInterface& reader) : m_reader(reader) { m_msg.set_ts_pattern(&m_ts_pattern); } -bool IrMessageParser::parse_next_encoded_message () { +auto IrMessageParser::parse_next_encoded_message () -> bool { if (m_is_four_bytes_encoded) { return parse_next_four_bytes_message(); } return parse_next_eight_bytes_message(); } -bool IrMessageParser::parse_next_eight_bytes_message () { +auto IrMessageParser::parse_next_eight_bytes_message () -> bool { m_msg.clear(); - epochtime_t ts; + epochtime_t ts{0}; 
vector encoded_vars; vector dict_vars; string logtype; @@ -126,10 +139,10 @@ bool IrMessageParser::parse_next_eight_bytes_message () { return true; } -bool IrMessageParser::parse_next_four_bytes_message () { +auto IrMessageParser::parse_next_four_bytes_message () -> bool { m_msg.clear(); - epochtime_t ts; + epochtime_t ts{0}; vector encoded_vars; vector dict_vars; string logtype; @@ -156,13 +169,14 @@ bool IrMessageParser::parse_next_four_bytes_message () { }; auto encoded_float_handler = [this] (four_byte_encoded_variable_t encoded_float) { - auto decoded_float = ffi::decode_float_var(encoded_float); - auto converted_float = EncodedVariableInterpreter::convert_four_bytes_float_to_clp_encoded_float(encoded_float); - m_msg.add_encoded_float(converted_float, decoded_float.size()); + const auto original_size_in_bytes = ffi::decode_float_var(encoded_float).size(); + eight_byte_encoded_variable_t converted_float {0}; + EncodedVariableInterpreter::convert_four_bytes_float_to_eight_byte(encoded_float, converted_float); + m_msg.add_encoded_float(converted_float, original_size_in_bytes); }; auto dict_var_handler = [this] (const string& dict_var) { - encoded_variable_t converted_var; + encoded_variable_t converted_var{0}; if (EncodedVariableInterpreter::convert_string_to_representable_integer_var(dict_var, converted_var)) { m_msg.add_encoded_integer(converted_var, dict_var.size()); } else if (EncodedVariableInterpreter::convert_string_to_representable_float_var(dict_var, converted_var)) { @@ -187,31 +201,9 @@ bool IrMessageParser::parse_next_four_bytes_message () { return true; } -bool IrMessageParser::is_ir_encoded (ReaderInterface& reader, bool& is_four_bytes_encoded) { - // Note. currently this method doesn't recover file pos. 
- if (ffi::ir_stream::IRErrorCode_Success != - ffi::ir_stream::get_encoding_type(reader, is_four_bytes_encoded)) { - return false; - } - return true; -} - -bool IrMessageParser::is_ir_encoded (size_t sequence_length, const char* data) { - if (sequence_length < MagicNumberLength) { - return false; - } - bool is_four_bytes_encoded; - BufferReader encoding_data (data, MagicNumberLength); - if (ffi::ir_stream::IRErrorCode_Success != - ffi::ir_stream::get_encoding_type(encoding_data, is_four_bytes_encoded)) { - return false; - } - return true; -} - -bool IrMessageParser::decode_json_preamble (std::string& json_metadata) { +auto IrMessageParser::decode_json_preamble (std::string& json_metadata) -> bool { // Decode and parse metadata - ffi::ir_stream::encoded_tag_t metadata_type; + ffi::ir_stream::encoded_tag_t metadata_type{0}; std::vector metadata_vec; if (ffi::ir_stream::IRErrorCode_Success != @@ -225,8 +217,21 @@ bool IrMessageParser::decode_json_preamble (std::string& json_metadata) { return false; } - json_metadata.assign(reinterpret_cast(metadata_vec.data()), + json_metadata.assign(size_checked_pointer_cast(metadata_vec.data()), metadata_vec.size()); return true; +} + +auto IrMessageParser::is_ir_encoded (size_t sequence_length, const char* data) -> bool { + if (sequence_length < MagicNumberLength) { + return false; + } + bool is_four_bytes_encoded{false}; + BufferReader encoding_data (data, MagicNumberLength); + if (ffi::ir_stream::IRErrorCode_Success != + ffi::ir_stream::get_encoding_type(encoding_data, is_four_bytes_encoded)) { + return false; + } + return true; } \ No newline at end of file diff --git a/components/core/src/IrMessageParser.hpp b/components/core/src/IrMessageParser.hpp index 519e95ac1..392137fef 100644 --- a/components/core/src/IrMessageParser.hpp +++ b/components/core/src/IrMessageParser.hpp @@ -25,26 +25,25 @@ class IrMessageParser { TraceableException (error_code, filename, line_number) {} // Methods - const char* what () const noexcept 
override { + [[nodiscard]] auto what () const noexcept -> const char* override { return "IrMessageParser operation failed"; } }; // Constructor - IrMessageParser (ReaderInterface& reader); + explicit IrMessageParser (ReaderInterface& reader); // Methods + auto get_ts_pattern () -> TimestampPattern* { return &m_ts_pattern; } + [[nodiscard]] auto get_parsed_msg () const -> const ParsedIrMessage& { return m_msg; } + auto get_msg_logtype_entry() -> LogTypeDictionaryEntry& { return m_msg.get_logtype_entry(); } + [[nodiscard]] auto parse_next_encoded_message () -> bool; static bool is_ir_encoded (size_t sequence_length, const char* data); - TimestampPattern* get_ts_pattern () { return &m_ts_pattern; } - const ParsedIrMessage& get_parsed_msg () const { return m_msg; } - LogTypeDictionaryEntry& get_msg_logtype_entry() { return m_msg.get_logtype_entry(); } - bool parse_next_encoded_message (); private: - bool parse_next_four_bytes_message(); - bool parse_next_eight_bytes_message(); - bool decode_json_preamble (std::string& json_metadata); - bool is_ir_encoded (ReaderInterface& reader, bool& is_four_bytes_encoded); + [[nodiscard]] auto parse_next_four_bytes_message() -> bool; + [[nodiscard]] auto parse_next_eight_bytes_message() -> bool; + [[nodiscard]] auto decode_json_preamble (std::string& json_metadata) -> bool; // member variables bool m_is_four_bytes_encoded; diff --git a/components/core/src/ParsedIrMessage.hpp b/components/core/src/ParsedIrMessage.hpp index 3b249cebf..b5fd28c5e 100644 --- a/components/core/src/ParsedIrMessage.hpp +++ b/components/core/src/ParsedIrMessage.hpp @@ -69,18 +69,18 @@ class ParsedIrMessage { }; // Methods - void clear(); - void clear_except_ts_patt(); + auto clear() -> void; + auto clear_except_ts_patt() -> void; // setter - void set_ts(epochtime_t ts); - void set_ts_pattern(TimestampPattern const* timestamp_pattern); + auto set_ts(epochtime_t ts) -> void; + auto set_ts_pattern(TimestampPattern const* timestamp_pattern) -> void; // note, 
this logtype is already escaped - void append_to_logtype(std::string const& value, size_t begin_pos, size_t length); - void add_encoded_integer(encoded_variable_t var, size_t original_size_in_bytes); - void add_encoded_float(encoded_variable_t var, size_t original_size_in_bytes); - void add_dictionary_var(std::string const& dictionary_var); + auto append_to_logtype(std::string const& value, size_t begin_pos, size_t length) -> void; + auto add_encoded_integer(encoded_variable_t var, size_t original_size_in_bytes) -> void; + auto add_encoded_float(encoded_variable_t var, size_t original_size_in_bytes) -> void; + auto add_dictionary_var(std::string const& dictionary_var) -> void; // getter [[nodiscard]] auto get_ts() const -> epochtime_t { return m_ts; } From a3fbf31bc48df0dffde439b66820d0faa30657d9 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Fri, 11 Aug 2023 19:51:25 -0400 Subject: [PATCH 080/121] Write missing docstrings and minor refactoring. 
--- components/core/src/BufferReader.cpp | 12 ++++---- components/core/src/BufferReader.hpp | 42 ++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 76bb3b11a..23f3c5015 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -1,6 +1,5 @@ #include "BufferReader.hpp" -// C++ standard libraries #include #include @@ -51,11 +50,11 @@ auto BufferReader::try_read_to_delimiter( bool append, std::string& str ) -> ErrorCode { - bool found_delim{false}; - size_t num_bytes_read{0}; if (false == append) { str.clear(); } + bool found_delim{false}; + size_t num_bytes_read{0}; return try_read_to_delimiter(delim, keep_delimiter, str, found_delim, num_bytes_read); } @@ -76,10 +75,10 @@ auto BufferReader::try_read_to_delimiter( if (0 == remaining_data_size) { return ErrorCode_EndOfFile; } + // Find the delimiter - char const* buffer_head = m_internal_buf + m_internal_buf_pos; - char const* delim_ptr - = static_cast(memchr(buffer_head, delim, remaining_data_size)); + auto const* buffer_head = m_internal_buf + m_internal_buf_pos; + auto* delim_ptr = static_cast(memchr(buffer_head, delim, remaining_data_size)); size_t delim_pos{0}; if (delim_ptr != nullptr) { @@ -93,7 +92,6 @@ auto BufferReader::try_read_to_delimiter( delim_pos = m_internal_buf_size; num_bytes_read = remaining_data_size; } - // append to strings str.append(buffer_head, num_bytes_read); m_internal_buf_pos = delim_pos; return ErrorCode_Success; diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 5b7cacea4..0e0325aff 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -1,7 +1,6 @@ #ifndef BUFFERREADER_HPP #define BUFFERREADER_HPP -// Project headers #include "ReaderInterface.hpp" /** @@ -31,18 +30,18 @@ class BufferReader : public ReaderInterface { /** * Tries to read up to a 
given number of bytes from the buffer * @param buf - * @param num_bytes_to_read The number of bytes to try and read - * @param num_bytes_read The actual number of bytes read - * @return ErrorCode_BadParam if buf is invalid - * @return ErrorCode_EndOfFile if buffer doesn't contain more data + * @param num_bytes_to_read + * @param num_bytes_read Returns the number of bytes read + * @return ErrorCode_EndOfFile if the buffer doesn't contain any more data * @return ErrorCode_Success on success */ [[nodiscard]] auto try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode override; /** - * Tries to seek from the beginning of the buffer to the given position + * Tries to seek to the given position, relative to the beginning of the + * buffer * @param pos - * @return ErrorCode_OutOfBounds if the given position > the buffer's size + * @return ErrorCode_Truncated if \p pos > the buffer's size * @return ErrorCode_Success on success */ [[nodiscard]] auto try_seek_from_begin(size_t pos) -> ErrorCode override; @@ -52,15 +51,40 @@ class BufferReader : public ReaderInterface { */ [[nodiscard]] auto try_get_pos(size_t& pos) -> ErrorCode override; + /** + * Tries to read up to an occurrence of the given delimiter + * @param delim + * @param keep_delimiter Whether to include the delimiter in the output + * string + * @param append Whether to append to the given string or replace its + * contents + * @param str Returns the content read from the buffer + * @return Same as BufferReader::try_read_to_delimiter(char, bool, + * std::string&, bool&, size_t&) + */ [[nodiscard]] auto try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str) -> ErrorCode override; - // Helper functions [[nodiscard]] auto get_buffer_size() const -> size_t { return m_internal_buf_size; } + /** + * @param buf Returns a pointer to the remaining content in the buffer + * @param peek_size Returns the size of the remaining content in the buffer + */ auto 
peek_buffer(char const*& buf, size_t& peek_size) -> void; + /** + * Tries to read up to an occurrence of the given delimiter + * @param delim + * @param keep_delimiter Whether to include the delimiter in the output + * string + * @param str Returns the content read from the buffer + * @param found_delim Whether a delimiter was found + * @param num_bytes_read How many bytes were read from the buffer + * @return ErrorCode_EndOfFile if the buffer doesn't contain any more data + * @return ErrorCode_Success on success + */ auto try_read_to_delimiter( char delim, bool keep_delimiter, @@ -70,7 +94,7 @@ class BufferReader : public ReaderInterface { ) -> ErrorCode; private: - // Method + // Methods [[nodiscard]] auto get_remaining_data_size() const -> size_t { return m_internal_buf_size - m_internal_buf_pos; } From 3e9f564125f4f5df23da056f4716d3592c322bf0 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Fri, 11 Aug 2023 20:24:21 -0400 Subject: [PATCH 081/121] Return num_bytes_read correctly --- components/core/src/BufferReader.cpp | 7 +++++-- components/core/src/BufferedFileReader.cpp | 20 ++++++++++++-------- components/core/src/clp/FileCompressor.cpp | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 76bb3b11a..d75a08778 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -82,19 +82,22 @@ auto BufferReader::try_read_to_delimiter( = static_cast(memchr(buffer_head, delim, remaining_data_size)); size_t delim_pos{0}; + size_t append_length{0}; if (delim_ptr != nullptr) { delim_pos = (delim_ptr - m_internal_buf) + 1; num_bytes_read = delim_pos - m_internal_buf_pos; + append_length = num_bytes_read; if (false == keep_delimiter && delim == m_internal_buf[delim_pos - 1]) { - --num_bytes_read; + --append_length; } found_delim = true; } else { delim_pos = m_internal_buf_size; num_bytes_read = 
remaining_data_size; + append_length = num_bytes_read; } // append to strings - str.append(buffer_head, num_bytes_read); + str.append(buffer_head, append_length); m_internal_buf_pos = delim_pos; return ErrorCode_Success; } diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index c282786f7..b8f6e3d85 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -161,23 +161,27 @@ auto BufferedFileReader::try_read_to_delimiter( str.clear(); } bool found_delim{false}; - size_t total_append_length{0}; + size_t total_num_bytes_read{0}; while (false == found_delim) { - size_t length{0}; - if (auto ret_code - = m_buffer_reader - ->try_read_to_delimiter(delim, keep_delimiter, str, found_delim, length); + size_t num_bytes_read{0}; + if (auto ret_code = m_buffer_reader->try_read_to_delimiter( + delim, + keep_delimiter, + str, + found_delim, + num_bytes_read + ); ret_code != ErrorCode_Success && ret_code != ErrorCode_EndOfFile) { return ret_code; } - update_file_pos(m_file_pos + length); - total_append_length += length; + update_file_pos(m_file_pos + num_bytes_read); + total_num_bytes_read += num_bytes_read; if (false == found_delim) { auto error_code = refill_reader_buffer(m_base_buffer_size); if (ErrorCode_EndOfFile == error_code) { - if (total_append_length == 0) { + if (total_num_bytes_read == 0) { return ErrorCode_EndOfFile; } return ErrorCode_Success; diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 2d774a517..4b6270ba8 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -116,7 +116,7 @@ namespace clp { } } - m_file_reader.close(); + std::ignore = m_file_reader.close(); Profiler::stop_continuous_measurement(); LOG_CONTINUOUS_MEASUREMENT(Profiler::ContinuousMeasurementIndex::ParseLogFile) From 1c53b57f4a1fc15de1b280893a33d3f515a1c992 Mon Sep 17 00:00:00 2001 From: 
Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sat, 12 Aug 2023 06:58:12 -0400 Subject: [PATCH 082/121] Clean-up BufferReader::try_read_to_delimiter --- components/core/src/BufferReader.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 5d04d6d1d..7158026a6 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -78,24 +78,23 @@ auto BufferReader::try_read_to_delimiter( // Find the delimiter auto const* buffer_head = m_internal_buf + m_internal_buf_pos; - auto* delim_ptr = static_cast(memchr(buffer_head, delim, remaining_data_size)); + auto const* delim_ptr + = static_cast(memchr(buffer_head, delim, remaining_data_size)); - size_t delim_pos{0}; size_t append_length{0}; if (delim_ptr != nullptr) { - delim_pos = (delim_ptr - m_internal_buf) + 1; - num_bytes_read = delim_pos - m_internal_buf_pos; + auto const delim_pos{delim_ptr - m_internal_buf}; + num_bytes_read = (delim_pos - m_internal_buf_pos) + 1; append_length = num_bytes_read; - if (false == keep_delimiter && delim == m_internal_buf[delim_pos - 1]) { + if (false == keep_delimiter) { --append_length; } found_delim = true; } else { - delim_pos = m_internal_buf_size; num_bytes_read = remaining_data_size; append_length = num_bytes_read; } str.append(buffer_head, append_length); - m_internal_buf_pos = delim_pos; + m_internal_buf_pos += num_bytes_read; return ErrorCode_Success; } From 2fe5c7760b994f88a4b7d31dc8bbee5af5df06d5 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sat, 12 Aug 2023 07:33:04 -0400 Subject: [PATCH 083/121] BufferReader: Reorder methods according to guidelines. 
--- components/core/src/BufferReader.cpp | 82 ++++++++++++++-------------- components/core/src/BufferReader.hpp | 57 ++++++++++--------- 2 files changed, 71 insertions(+), 68 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 7158026a6..54ad250d6 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -12,6 +12,47 @@ BufferReader::BufferReader(char const* data, size_t data_size, size_t pos) { m_internal_buf_pos = pos; } +auto BufferReader::peek_buffer(char const*& buf, size_t& peek_size) -> void { + peek_size = get_remaining_data_size(); + buf = m_internal_buf + m_internal_buf_pos; +} + +auto BufferReader::try_read_to_delimiter( + char delim, + bool keep_delimiter, + std::string& str, + bool& found_delim, + size_t& num_bytes_read +) -> ErrorCode { + found_delim = false; + auto const remaining_data_size = get_remaining_data_size(); + if (0 == remaining_data_size) { + return ErrorCode_EndOfFile; + } + + // Find the delimiter + auto const* buffer_head = m_internal_buf + m_internal_buf_pos; + auto const* delim_ptr + = static_cast(memchr(buffer_head, delim, remaining_data_size)); + + size_t append_length{0}; + if (delim_ptr != nullptr) { + auto const delim_pos{delim_ptr - m_internal_buf}; + num_bytes_read = (delim_pos - m_internal_buf_pos) + 1; + append_length = num_bytes_read; + if (false == keep_delimiter) { + --append_length; + } + found_delim = true; + } else { + num_bytes_read = remaining_data_size; + append_length = num_bytes_read; + } + str.append(buffer_head, append_length); + m_internal_buf_pos += num_bytes_read; + return ErrorCode_Success; +} + auto BufferReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode { if (nullptr == buf && num_bytes_to_read > 0) { @@ -57,44 +98,3 @@ auto BufferReader::try_read_to_delimiter( size_t num_bytes_read{0}; return try_read_to_delimiter(delim, keep_delimiter, str, found_delim, num_bytes_read); } 
- -auto BufferReader::peek_buffer(char const*& buf, size_t& peek_size) -> void { - peek_size = get_remaining_data_size(); - buf = m_internal_buf + m_internal_buf_pos; -} - -auto BufferReader::try_read_to_delimiter( - char delim, - bool keep_delimiter, - std::string& str, - bool& found_delim, - size_t& num_bytes_read -) -> ErrorCode { - found_delim = false; - auto const remaining_data_size = get_remaining_data_size(); - if (0 == remaining_data_size) { - return ErrorCode_EndOfFile; - } - - // Find the delimiter - auto const* buffer_head = m_internal_buf + m_internal_buf_pos; - auto const* delim_ptr - = static_cast(memchr(buffer_head, delim, remaining_data_size)); - - size_t append_length{0}; - if (delim_ptr != nullptr) { - auto const delim_pos{delim_ptr - m_internal_buf}; - num_bytes_read = (delim_pos - m_internal_buf_pos) + 1; - append_length = num_bytes_read; - if (false == keep_delimiter) { - --append_length; - } - found_delim = true; - } else { - num_bytes_read = remaining_data_size; - append_length = num_bytes_read; - } - str.append(buffer_head, append_length); - m_internal_buf_pos += num_bytes_read; - return ErrorCode_Success; -} diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 0e0325aff..337bcb88a 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -26,6 +26,34 @@ class BufferReader : public ReaderInterface { BufferReader(char const* data, size_t data_size, size_t pos); + // Methods + [[nodiscard]] auto get_buffer_size() const -> size_t { return m_internal_buf_size; } + + /** + * @param buf Returns a pointer to the remaining content in the buffer + * @param peek_size Returns the size of the remaining content in the buffer + */ + auto peek_buffer(char const*& buf, size_t& peek_size) -> void; + + /** + * Tries to read up to an occurrence of the given delimiter + * @param delim + * @param keep_delimiter Whether to include the delimiter in the output + * string + * @param str 
Returns the content read from the buffer + * @param found_delim Whether a delimiter was found + * @param num_bytes_read How many bytes were read from the buffer + * @return ErrorCode_EndOfFile if the buffer doesn't contain any more data + * @return ErrorCode_Success on success + */ + auto try_read_to_delimiter( + char delim, + bool keep_delimiter, + std::string& str, + bool& found_delim, + size_t& num_bytes_read + ) -> ErrorCode; + // Methods implementing the ReaderInterface /** * Tries to read up to a given number of bytes from the buffer @@ -37,6 +65,7 @@ class BufferReader : public ReaderInterface { */ [[nodiscard]] auto try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode override; + /** * Tries to seek to the given position, relative to the beginning of the * buffer @@ -45,6 +74,7 @@ class BufferReader : public ReaderInterface { * @return ErrorCode_Success on success */ [[nodiscard]] auto try_seek_from_begin(size_t pos) -> ErrorCode override; + /** * @param pos Returns the position of the read head in the buffer * @return ErrorCode_Success @@ -66,33 +96,6 @@ class BufferReader : public ReaderInterface { try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str) -> ErrorCode override; - [[nodiscard]] auto get_buffer_size() const -> size_t { return m_internal_buf_size; } - - /** - * @param buf Returns a pointer to the remaining content in the buffer - * @param peek_size Returns the size of the remaining content in the buffer - */ - auto peek_buffer(char const*& buf, size_t& peek_size) -> void; - - /** - * Tries to read up to an occurrence of the given delimiter - * @param delim - * @param keep_delimiter Whether to include the delimiter in the output - * string - * @param str Returns the content read from the buffer - * @param found_delim Whether a delimiter was found - * @param num_bytes_read How many bytes were read from the buffer - * @return ErrorCode_EndOfFile if the buffer doesn't contain any more 
data - * @return ErrorCode_Success on success - */ - auto try_read_to_delimiter( - char delim, - bool keep_delimiter, - std::string& str, - bool& found_delim, - size_t& num_bytes_read - ) -> ErrorCode; - private: // Methods [[nodiscard]] auto get_remaining_data_size() const -> size_t { From 36508b7fedc026ef37844ed9025db51679d72d7e Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sat, 12 Aug 2023 21:45:21 -0400 Subject: [PATCH 084/121] Libarchive*: Some clean-up. --- components/core/src/LibarchiveFileReader.cpp | 9 ++++----- components/core/src/LibarchiveFileReader.hpp | 9 +++------ components/core/src/LibarchiveReader.cpp | 2 +- components/core/src/LibarchiveReader.hpp | 4 ++-- components/core/src/clp/FileCompressor.cpp | 6 ++++-- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/components/core/src/LibarchiveFileReader.cpp b/components/core/src/LibarchiveFileReader.cpp index 1416db47b..db488475b 100644 --- a/components/core/src/LibarchiveFileReader.cpp +++ b/components/core/src/LibarchiveFileReader.cpp @@ -167,8 +167,7 @@ ErrorCode LibarchiveFileReader::try_read_to_delimiter (char delim, bool keep_del return ErrorCode_Success; } -ErrorCode LibarchiveFileReader::try_peek_data_block(const char*&buf, - size_t&buf_size) { +ErrorCode LibarchiveFileReader::try_peek_buffered_data(char const*& buf, size_t& buf_size) { if (nullptr == m_archive) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -190,16 +189,16 @@ ErrorCode LibarchiveFileReader::try_peek_data_block(const char*&buf, // Position in the file is within the data block, so we can directly // return a const pointer to the current data block buf_size = m_data_block_length - m_pos_in_data_block; - buf = static_cast(m_data_block); + buf = static_cast(m_data_block); return ErrorCode_Success; } auto num_sparse_bytes = m_data_block_pos_in_file - m_pos_in_file; buf_size = num_sparse_bytes + m_data_block_length; 
m_data_for_peek.resize(buf_size, '\0'); - buf = static_cast(m_data_for_peek.data()); + buf = static_cast(m_data_for_peek.data()); - size_t remaining_bytes_to_peek = buf_size - num_sparse_bytes; + size_t const remaining_bytes_to_peek = buf_size - num_sparse_bytes; memcpy(&m_data_for_peek[num_sparse_bytes], m_data_block, remaining_bytes_to_peek); return ErrorCode_Success; diff --git a/components/core/src/LibarchiveFileReader.hpp b/components/core/src/LibarchiveFileReader.hpp index f4544a7a3..063e83f72 100644 --- a/components/core/src/LibarchiveFileReader.hpp +++ b/components/core/src/LibarchiveFileReader.hpp @@ -31,8 +31,7 @@ class LibarchiveFileReader : public ReaderInterface { }; // Constructors - LibarchiveFileReader () : m_archive(nullptr), m_archive_entry(nullptr), m_data_block(nullptr), - m_reached_eof(false), m_pos_in_file(0) {} + LibarchiveFileReader () : m_archive(nullptr), m_archive_entry(nullptr), m_data_block(nullptr), m_reached_eof(false), m_pos_in_file(0) {} // Methods implementing the ReaderInterface /** @@ -72,15 +71,13 @@ class LibarchiveFileReader : public ReaderInterface { ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; /** - * Tries to peek from the next data block and returns the available - * data size - * @param buf + * @param buf Returns a pointer to any buffered data * @param buf_size Returns the number of bytes in the buffer * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Failure on failure * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_peek_data_block(const char*&buf, size_t&buf_size); + [[nodiscard]] ErrorCode try_peek_buffered_data(char const*& buf, size_t& buf_size); // Methods /** diff --git a/components/core/src/LibarchiveReader.cpp b/components/core/src/LibarchiveReader.cpp index 7fc2bcac4..3a61d7efc 100644 --- a/components/core/src/LibarchiveReader.cpp +++ b/components/core/src/LibarchiveReader.cpp @@ -7,7 +7,7 @@ #include "Defs.h" #include 
"spdlog_with_specializations.hpp" -ErrorCode LibarchiveReader::try_open (ReaderInterface&reader, const std::string& path_if_compressed_file) { +ErrorCode LibarchiveReader::try_open (ReaderInterface& reader, const std::string& path_if_compressed_file) { // Create and initialize internal libarchive m_archive = archive_read_new(); if (nullptr == m_archive) { diff --git a/components/core/src/LibarchiveReader.hpp b/components/core/src/LibarchiveReader.hpp index 77a6a5dff..734bc67a9 100644 --- a/components/core/src/LibarchiveReader.hpp +++ b/components/core/src/LibarchiveReader.hpp @@ -42,7 +42,7 @@ class LibarchiveReader { // Methods /** - * Tries to open the archive or compressed file contained in the reader + * Tries to open the archive or compressed file from the given reader * @param reader * @param path_if_compressed_file Path to use if the data is a single compressed file * @return ErrorCode_Success on success @@ -148,7 +148,7 @@ class LibarchiveReader { struct archive_entry* m_archive_entry; std::vector m_buffer; - ReaderInterface*m_reader; + ReaderInterface* m_reader; std::string m_filename_if_compressed; diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 4b6270ba8..75618a417 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -248,8 +248,10 @@ namespace clp { } m_libarchive_reader.open_file_reader(m_libarchive_file_reader); - error_code = m_libarchive_file_reader.try_peek_data_block( - m_utf8_validation_buf, m_utf8_validation_buf_length); + error_code = m_libarchive_file_reader.try_peek_buffered_data( + m_utf8_validation_buf, + m_utf8_validation_buf_length + ); // Check that file is UTF-8 encoded if (ErrorCode_Success != error_code) { if (ErrorCode_EndOfFile != error_code) { From 47a52186044cb43b586956bb7f745f604162ef0d Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sat, 12 Aug 2023 23:41:44 -0400 
Subject: [PATCH 085/121] Undo unnecessary changes to ffi/ir_stream/encoding_methods.hpp --- .../src/ffi/ir_stream/encoding_methods.hpp | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/components/core/src/ffi/ir_stream/encoding_methods.hpp b/components/core/src/ffi/ir_stream/encoding_methods.hpp index 604968e02..f795bfba4 100644 --- a/components/core/src/ffi/ir_stream/encoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/encoding_methods.hpp @@ -13,14 +13,14 @@ namespace eight_byte_encoding { * @param timestamp_pattern * @param timestamp_pattern_syntax * @param time_zone_id - * @param reader + * @param ir_buf * @return true on success, false otherwise */ bool encode_preamble( std::string_view timestamp_pattern, std::string_view timestamp_pattern_syntax, std::string_view time_zone_id, - std::vector& reader + std::vector& ir_buf ); /** @@ -28,14 +28,14 @@ namespace eight_byte_encoding { * @param timestamp * @param message * @param logtype - * @param reader + * @param ir_buf * @return true on success, false otherwise */ bool encode_message( epoch_time_ms_t timestamp, std::string_view message, std::string& logtype, - std::vector& reader + std::vector& ir_buf ); } // namespace eight_byte_encoding @@ -46,7 +46,7 @@ namespace four_byte_encoding { * @param timestamp_pattern_syntax * @param time_zone_id * @param reference_timestamp - * @param reader + * @param ir_buf * @return true on success, false otherwise */ bool encode_preamble( @@ -54,7 +54,7 @@ namespace four_byte_encoding { std::string_view timestamp_pattern_syntax, std::string_view time_zone_id, epoch_time_ms_t reference_timestamp, - std::vector& reader + std::vector& ir_buf ); /** @@ -62,35 +62,34 @@ namespace four_byte_encoding { * @param timestamp_delta * @param message * @param logtype - * @param reader + * @param ir_buf * @return true on success, false otherwise */ bool encode_message( epoch_time_ms_t timestamp_delta, std::string_view message, std::string& logtype, 
- std::vector& reader + std::vector& ir_buf ); /** - * Encodes the given message into the four-byte encoding IR stream - * without encoding timestamp delta + * Encodes the given message into the four-byte encoding IR stream without + * encoding timestamp delta * @param message * @param logtype - * @param reader + * @param ir_buf * @return true on success, false otherwise */ bool - encode_message(std::string_view message, std::string& logtype, std::vector& reader); + encode_message(std::string_view message, std::string& logtype, std::vector& ir_buf); /** - * Encodes the given timestamp delta into the four-byte encoding IR - * stream + * Encodes the given timestamp delta into the four-byte encoding IR stream * @param timestamp_delta - * @param reader + * @param ir_buf * @return true on success, false otherwise */ - bool encode_timestamp(epoch_time_ms_t timestamp_delta, std::vector& reader); + bool encode_timestamp(epoch_time_ms_t timestamp_delta, std::vector& ir_buf); } // namespace four_byte_encoding } // namespace ffi::ir_stream From 03991d732a4942fe8901e74eb8d2bafe48b865c3 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sat, 12 Aug 2023 23:46:23 -0400 Subject: [PATCH 086/121] Undo unnecessary changes and clean-up ffi/ir_stream/decoding_methods.* --- .../src/ffi/ir_stream/decoding_methods.cpp | 75 ++++++++++--------- .../src/ffi/ir_stream/decoding_methods.hpp | 58 +++++++------- 2 files changed, 68 insertions(+), 65 deletions(-) diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 8828f58af..7e86df0b3 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -19,97 +19,97 @@ template static bool is_variable_tag(encoded_tag_t tag, bool& is_encoded_var); /** - * Decodes an integer from reader + * Decodes an integer from the given reader * @tparam integer_t Type of 
the integer to decode * @param reader * @param value Returns the decoded integer - * @return true on success, false if the reader doesn't contain enough data - * to decode + * @return true on success, false if the reader doesn't contain enough data to + * decode */ template static bool decode_int(ReaderInterface& reader, integer_t& value); /** - * Decodes the next logtype string from reader + * Decodes the next logtype string from the given reader * @param reader * @param encoded_tag * @param logtype Returns the logtype string * @return IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode */ static IRErrorCode parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype); /** - * Decodes the next dictionary-type variable string from reader + * Decodes the next dictionary-type variable string from the given reader * @param reader * @param encoded_tag * @param dict_var Returns the dictionary variable * @return IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough - * data to decode + * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough data + * to decode */ static IRErrorCode parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var); /** - * Parses the next timestamp from reader + * Parses the next timestamp from the given reader * @tparam encoded_variable_t Type of the encoded variable * @param reader * @param encoded_tag * @param ts Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t or the actual - * timestamp if encoded_variable_t == eight_byte_encoded_variable_t + * encoded_variable_t == four_byte_encoded_variable_t or 
the actual timestamp if + * encoded_variable_t == eight_byte_encoded_variable_t * @return IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode */ template IRErrorCode parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); /** - * Decodes the next encoded message from reader + * Decodes the next encoded message from the given reader * @tparam encoded_variable_t Type of the encoded variable * @param reader * @param message Returns the decoded message * @param timestamp Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t or the actual - * timestamp if encoded_variable_t == eight_byte_encoded_variable_t + * encoded_variable_t == four_byte_encoded_variable_t or the actual timestamp if + * encoded_variable_t == eight_byte_encoded_variable_t * @return IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Decode_Error if the encoded message cannot be - * properly decoded - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode + * @return IRErrorCode_Decode_Error if the encoded message cannot be properly + * decoded + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode */ template static IRErrorCode generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp); /** - * Reads metadata information from the reader + * Reads metadata information from the given reader * @param reader * @param metadata_type Returns the type of the metadata found in the IR * @param metadata_pos Returns the starting position of the metadata in reader * @param metadata_size Returns the size of the metadata written in the IR * @return 
IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode */ static IRErrorCode read_metadata_info(ReaderInterface& reader, encoded_tag_t& metadata_type, uint16_t& metadata_size); /** - * Decodes the message from the given logtype, encoded variables, and - * dictionary variables. This function properly handles escaped variable - * placeholders in the logtype, as opposed to ffi::decode_message that - * doesn't handle escaped placeholders for simplicity + * Decodes the message from the given logtype, encoded variables, and dictionary + * variables. This function properly handles escaped variable placeholders in + * the logtype, as opposed to ffi::decode_message that doesn't handle escaped + * placeholders for simplicity * @tparam encoded_variable_t Type of the encoded variable * @param logtype * @param encoded_vars @@ -126,7 +126,11 @@ static string decode_message( template static bool is_variable_tag(encoded_tag_t tag, bool& is_encoded_var) { - static_assert(is_same_v || is_same_v); + static_assert( + (is_same_v + || is_same_v) + ); + if (tag == cProtocol::Payload::VarStrLenUByte || tag == cProtocol::Payload::VarStrLenUShort || tag == cProtocol::Payload::VarStrLenInt) { @@ -237,7 +241,10 @@ parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& template IRErrorCode parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { - static_assert(is_same_v || is_same_v); + static_assert( + (is_same_v + || is_same_v) + ); if constexpr (is_same_v) { if (cProtocol::Payload::TimestampVal != encoded_tag) { @@ -478,11 +485,7 @@ static string decode_message( } // Add remainder if (next_static_text_begin_pos < logtype.length()) { - message.append( - logtype, - next_static_text_begin_pos, - logtype.length() - 
next_static_text_begin_pos - ); + message.append(logtype, next_static_text_begin_pos); } return message; @@ -533,7 +536,7 @@ IRErrorCode decode_preamble( encoded_tag_t& metadata_type, std::vector& metadata ) { - uint16_t metadata_size; + uint16_t metadata_size{0}; if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); error_code != IRErrorCode_Success) { diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index cd6b409ab..85577b52b 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -20,28 +20,28 @@ typedef enum { /** * Decodes the encoding type for the encoded IR stream - * @param ir_buf + * @param reader * @param is_four_bytes_encoding Returns the encoding type * @return ErrorCode_Success on success - * @return ErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data to + * @return ErrorCode_Corrupted_IR if reader contains invalid IR + * @return ErrorCode_Incomplete_IR if reader doesn't contain enough data to * decode */ -IRErrorCode get_encoding_type(ReaderInterface& ir_buf, bool& is_four_bytes_encoding); +IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding); /** * Decodes the preamble for an IR stream. 
- * @param ir_buf + * @param reader * @param metadata_type Returns the type of the metadata found in the IR - * @param metadata_pos Returns the starting position of the metadata in ir_buf + * @param metadata_pos Returns the starting position of the metadata in reader * @param metadata_size Returns the size of the metadata written in the IR * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough - * data to decode + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode */ IRErrorCode decode_preamble( - ReaderInterface& ir_buf, + ReaderInterface& reader, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size @@ -49,16 +49,16 @@ IRErrorCode decode_preamble( /** * Decodes the preamble for an IR stream. - * @param ir_buf + * @param reader * @param metadata_type Returns the type of the metadata found in the IR - * @param metadata Returns the metadata as a vector by reference + * @param metadata Returns the metadata in the given vector * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return IRErrorCode_Incomplete_IR if ir_buf doesn't contain enough + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough * data to decode */ IRErrorCode decode_preamble( - ReaderInterface& ir_buf, + ReaderInterface& reader, encoded_tag_t& metadata_type, std::vector& metadata ); @@ -66,37 +66,37 @@ IRErrorCode decode_preamble( namespace eight_byte_encoding { /** * Decodes the next message for the eight-byte encoding IR stream. 
- * @param ir_buf + * @param reader * @param message Returns the decoded message * @param timestamp Returns the decoded timestamp * @return ErrorCode_Success on success - * @return ErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return ErrorCode_Decode_Error if the encoded message cannot be - * properly decoded - * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data - * to decode + * @return ErrorCode_Corrupted_IR if reader contains invalid IR + * @return ErrorCode_Decode_Error if the encoded message cannot be properly + * decoded + * @return ErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode * @return ErrorCode_End_of_IR if the IR ends */ IRErrorCode - decode_next_message(ReaderInterface& ir_buf, std::string& message, epoch_time_ms_t& timestamp); + decode_next_message(ReaderInterface& reader, std::string& message, epoch_time_ms_t& timestamp); } // namespace eight_byte_encoding namespace four_byte_encoding { /** * Decodes the next message for the four-byte encoding IR stream. 
- * @param ir_buf + * @param reader * @param message Returns the decoded message * @param timestamp_delta Returns the decoded timestamp delta * @return ErrorCode_Success on success - * @return ErrorCode_Corrupted_IR if ir_buf contains invalid IR - * @return ErrorCode_Decode_Error if the encoded message cannot be - * properly decoded - * @return ErrorCode_Incomplete_IR if ir_buf doesn't contain enough data - * to decode + * @return ErrorCode_Corrupted_IR if reader contains invalid IR + * @return ErrorCode_Decode_Error if the encoded message cannot be properly + * decoded + * @return ErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode * @return ErrorCode_End_of_IR if the IR ends */ IRErrorCode decode_next_message( - ReaderInterface& ir_buf, + ReaderInterface& reader, std::string& message, epoch_time_ms_t& timestamp_delta ); From 7bc93c98bc51233e228243b9e920b3f7528e6e04 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Sun, 13 Aug 2023 22:54:55 -0400 Subject: [PATCH 087/121] Fixes --- .../core/src/EncodedVariableInterpreter.hpp | 1 + components/core/src/IrMessageParser.cpp | 130 +++++++++++------- components/core/src/IrMessageParser.hpp | 27 ++-- .../src/ffi/ir_stream/decoding_methods.cpp | 36 +++-- .../src/ffi/ir_stream/decoding_methods.hpp | 9 +- .../src/ffi/ir_stream/decoding_methods.tpp | 8 +- 6 files changed, 133 insertions(+), 78 deletions(-) diff --git a/components/core/src/EncodedVariableInterpreter.hpp b/components/core/src/EncodedVariableInterpreter.hpp index 586229f1d..115120f42 100644 --- a/components/core/src/EncodedVariableInterpreter.hpp +++ b/components/core/src/EncodedVariableInterpreter.hpp @@ -62,6 +62,7 @@ class EncodedVariableInterpreter { * @param value */ static void convert_encoded_float_to_string (encoded_variable_t encoded_var, std::string& value); + /** * Converts the four bytes encoded float to eight byte encoded float * @param four_bytes_float diff --git 
a/components/core/src/IrMessageParser.cpp b/components/core/src/IrMessageParser.cpp index d3872e57d..e8f0fa2c9 100644 --- a/components/core/src/IrMessageParser.cpp +++ b/components/core/src/IrMessageParser.cpp @@ -13,8 +13,8 @@ #include "../../../submodules/json/single_include/nlohmann/json.hpp" using ffi::cVariablePlaceholderEscapeCharacter; -using ffi::four_byte_encoded_variable_t; using ffi::eight_byte_encoded_variable_t; +using ffi::four_byte_encoded_variable_t; using ffi::ir_stream::cProtocol::MagicNumberLength; using ffi::ir_stream::IRErrorCode; using ffi::VariablePlaceholder; @@ -28,7 +28,8 @@ namespace { * @param is_four_bytes_encoded Returns the encoding type * or Ir header that can't be properly decoded */ -[[nodiscard]] auto decode_ir_magic_number(ReaderInterface& reader, bool& is_four_bytes_encoded) -> bool { +[[nodiscard]] auto decode_ir_magic_number(ReaderInterface& reader, bool& is_four_bytes_encoded) + -> bool { // Note. On failure, this method doesn't recover file pos. if (ffi::ir_stream::IRErrorCode_Success != ffi::ir_stream::get_encoding_type(reader, is_four_bytes_encoded)) @@ -46,7 +47,7 @@ namespace { * @throw OperationFailed if the reader doesn't contain IR encoded data, * or IR data that can't be properly decoded */ -IrMessageParser::IrMessageParser (ReaderInterface& reader) : m_reader(reader) { +IrMessageParser::IrMessageParser(ReaderInterface& reader) : m_reader(reader) { if (false == decode_ir_magic_number(m_reader, m_is_four_bytes_encoded)) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } @@ -56,7 +57,7 @@ IrMessageParser::IrMessageParser (ReaderInterface& reader) : m_reader(reader) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - const string mocked_ts_pattern = "%Y-%m-%dT%H:%M:%S.%3"; + string const mocked_ts_pattern = "%Y-%m-%dT%H:%M:%S.%3"; try { auto metadata_json = nlohmann::json::parse(json_metadata); string version = metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey); @@ 
-69,26 +70,28 @@ IrMessageParser::IrMessageParser (ReaderInterface& reader) : m_reader(reader) { m_ts_pattern = TimestampPattern(0, mocked_ts_pattern); if (m_is_four_bytes_encoded) { - m_reference_timestamp = std::stoll(metadata_json.at( - ffi::ir_stream::cProtocol::Metadata::ReferenceTimestampKey).get()); + m_reference_timestamp = std::stoll( + metadata_json.at(ffi::ir_stream::cProtocol::Metadata::ReferenceTimestampKey) + .get() + ); m_msg.set_ts(m_reference_timestamp); } - } catch (const nlohmann::json::parse_error& e) { + } catch (nlohmann::json::parse_error const& e) { SPDLOG_ERROR("Failed to parse json metadata from reader"); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } m_msg.set_ts_pattern(&m_ts_pattern); } -auto IrMessageParser::parse_next_encoded_message () -> bool { +auto IrMessageParser::parse_next_encoded_message() -> bool { if (m_is_four_bytes_encoded) { return parse_next_four_bytes_message(); } return parse_next_eight_bytes_message(); } -auto IrMessageParser::parse_next_eight_bytes_message () -> bool { +auto IrMessageParser::parse_next_eight_bytes_message() -> bool { m_msg.clear(); epochtime_t ts{0}; @@ -96,9 +99,8 @@ auto IrMessageParser::parse_next_eight_bytes_message () -> bool { vector dict_vars; string logtype; - auto error_code = ffi::ir_stream::generic_parse_tokens( - m_reader, logtype, encoded_vars, dict_vars, ts - ); + auto error_code + = ffi::ir_stream::generic_parse_tokens(m_reader, logtype, encoded_vars, dict_vars, ts); if (IRErrorCode::IRErrorCode_Success != error_code) { if (IRErrorCode::IRErrorCode_Eof != error_code) { @@ -107,30 +109,40 @@ auto IrMessageParser::parse_next_eight_bytes_message () -> bool { return false; } - auto constant_handler = [this] (const std::string& value, size_t begin_pos, size_t length) { + auto constant_handler = [this](std::string const& value, size_t begin_pos, size_t length) { m_msg.append_to_logtype(value, begin_pos, length); }; - auto encoded_int_handler = [this] 
(eight_byte_encoded_variable_t value) { + auto constant_remainder_handler = [this](std::string const& value, size_t begin_pos) { + auto const remaining_size = value.length() - begin_pos; + m_msg.append_to_logtype(value, begin_pos, remaining_size); + }; + + auto encoded_int_handler = [this](eight_byte_encoded_variable_t value) { auto decoded_int = ffi::decode_integer_var(value); m_msg.add_encoded_integer(value, decoded_int.length()); }; - auto encoded_float_handler = [this] (eight_byte_encoded_variable_t encoded_float) { + auto encoded_float_handler = [this](eight_byte_encoded_variable_t encoded_float) { auto decoded_float = ffi::decode_float_var(encoded_float); m_msg.add_encoded_float(encoded_float, decoded_float.size()); }; - auto dict_var_handler = [this] (const string& dict_var) { - m_msg.add_dictionary_var(dict_var); - }; + auto dict_var_handler = [this](string const& dict_var) { m_msg.add_dictionary_var(dict_var); }; // handle timestamp m_msg.set_ts(ts); try { - ffi::ir_stream::generic_decode_message(logtype, encoded_vars, dict_vars, - constant_handler, encoded_int_handler, - encoded_float_handler, dict_var_handler); + ffi::ir_stream::generic_decode_message( + logtype, + encoded_vars, + dict_vars, + constant_handler, + constant_remainder_handler, + encoded_int_handler, + encoded_float_handler, + dict_var_handler + ); } catch (ffi::ir_stream::DecodingException& e) { SPDLOG_ERROR("Decoding failed with exception {}", e.what()); return false; @@ -139,7 +151,7 @@ auto IrMessageParser::parse_next_eight_bytes_message () -> bool { return true; } -auto IrMessageParser::parse_next_four_bytes_message () -> bool { +auto IrMessageParser::parse_next_four_bytes_message() -> bool { m_msg.clear(); epochtime_t ts{0}; @@ -147,9 +159,8 @@ auto IrMessageParser::parse_next_four_bytes_message () -> bool { vector dict_vars; string logtype; - auto error_code = ffi::ir_stream::generic_parse_tokens( - m_reader, logtype, encoded_vars, dict_vars, ts - ); + auto error_code + = 
ffi::ir_stream::generic_parse_tokens(m_reader, logtype, encoded_vars, dict_vars, ts); if (IRErrorCode::IRErrorCode_Success != error_code) { if (IRErrorCode::IRErrorCode_Eof != error_code) { @@ -158,28 +169,44 @@ auto IrMessageParser::parse_next_four_bytes_message () -> bool { return false; } - auto constant_handler = [this] (const std::string& value, size_t begin_pos, size_t length) { + auto constant_handler = [this](std::string const& value, size_t begin_pos, size_t length) { m_msg.append_to_logtype(value, begin_pos, length); }; - auto encoded_int_handler = [this] (four_byte_encoded_variable_t value) { + auto constant_remainder_handler = [this](std::string const& value, size_t begin_pos) { + auto const remaining_size = value.length() - begin_pos; + m_msg.append_to_logtype(value, begin_pos, remaining_size); + }; + + auto encoded_int_handler = [this](four_byte_encoded_variable_t value) { // assume that we need the actual size auto decoded_int = ffi::decode_integer_var(value); m_msg.add_encoded_integer(value, decoded_int.length()); }; - auto encoded_float_handler = [this] (four_byte_encoded_variable_t encoded_float) { + auto encoded_float_handler = [this](four_byte_encoded_variable_t encoded_float) { const auto original_size_in_bytes = ffi::decode_float_var(encoded_float).size(); - eight_byte_encoded_variable_t converted_float {0}; - EncodedVariableInterpreter::convert_four_bytes_float_to_eight_byte(encoded_float, converted_float); + eight_byte_encoded_variable_t converted_float{0}; + EncodedVariableInterpreter::convert_four_bytes_float_to_eight_byte( + encoded_float, + converted_float + ); m_msg.add_encoded_float(converted_float, original_size_in_bytes); }; - auto dict_var_handler = [this] (const string& dict_var) { + auto dict_var_handler = [this](string const& dict_var) { encoded_variable_t converted_var{0}; - if (EncodedVariableInterpreter::convert_string_to_representable_integer_var(dict_var, converted_var)) { + if 
(EncodedVariableInterpreter::convert_string_to_representable_integer_var( + dict_var, + converted_var + )) + { m_msg.add_encoded_integer(converted_var, dict_var.size()); - } else if (EncodedVariableInterpreter::convert_string_to_representable_float_var(dict_var, converted_var)) { + } else if (EncodedVariableInterpreter::convert_string_to_representable_float_var( + dict_var, + converted_var + )) + { m_msg.add_encoded_float(converted_var, dict_var.size()); } else { m_msg.add_dictionary_var(dict_var); @@ -190,9 +217,16 @@ auto IrMessageParser::parse_next_four_bytes_message () -> bool { m_reference_timestamp += ts; m_msg.set_ts(m_reference_timestamp); try { - ffi::ir_stream::generic_decode_message(logtype, encoded_vars, dict_vars, - constant_handler, encoded_int_handler, - encoded_float_handler, dict_var_handler); + ffi::ir_stream::generic_decode_message( + logtype, + encoded_vars, + dict_vars, + constant_handler, + constant_remainder_handler, + encoded_int_handler, + encoded_float_handler, + dict_var_handler + ); } catch (ffi::ir_stream::DecodingException& e) { SPDLOG_ERROR("Decoding failed with exception {}", e.what()); return false; @@ -201,13 +235,14 @@ auto IrMessageParser::parse_next_four_bytes_message () -> bool { return true; } -auto IrMessageParser::decode_json_preamble (std::string& json_metadata) -> bool { +auto IrMessageParser::decode_json_preamble(std::string& json_metadata) -> bool { // Decode and parse metadata ffi::ir_stream::encoded_tag_t metadata_type{0}; std::vector metadata_vec; - if (ffi::ir_stream::IRErrorCode_Success != - ffi::ir_stream::decode_preamble(m_reader, metadata_type, metadata_vec)) { + if (ffi::ir_stream::IRErrorCode_Success + != ffi::ir_stream::decode_preamble(m_reader, metadata_type, metadata_vec)) + { SPDLOG_ERROR("Failed to parse metadata"); return false; } @@ -217,21 +252,24 @@ auto IrMessageParser::decode_json_preamble (std::string& json_metadata) -> bool return false; } - 
json_metadata.assign(size_checked_pointer_cast(metadata_vec.data()), - metadata_vec.size()); + json_metadata.assign( + size_checked_pointer_cast(metadata_vec.data()), + metadata_vec.size() + ); return true; } -auto IrMessageParser::is_ir_encoded (size_t sequence_length, const char* data) -> bool { +auto IrMessageParser::is_ir_encoded(size_t sequence_length, char const* data) -> bool { if (sequence_length < MagicNumberLength) { return false; } bool is_four_bytes_encoded{false}; - BufferReader encoding_data (data, MagicNumberLength); - if (ffi::ir_stream::IRErrorCode_Success != - ffi::ir_stream::get_encoding_type(encoding_data, is_four_bytes_encoded)) { + BufferReader encoding_data(data, MagicNumberLength); + if (ffi::ir_stream::IRErrorCode_Success + != ffi::ir_stream::get_encoding_type(encoding_data, is_four_bytes_encoded)) + { return false; } return true; -} \ No newline at end of file +} diff --git a/components/core/src/IrMessageParser.hpp b/components/core/src/IrMessageParser.hpp index 392137fef..4809ea8ca 100644 --- a/components/core/src/IrMessageParser.hpp +++ b/components/core/src/IrMessageParser.hpp @@ -7,9 +7,9 @@ // C++ standard libraries // Project headers -#include "TraceableException.hpp" #include "ffi/ir_stream/decoding_methods.hpp" #include "ParsedIrMessage.hpp" +#include "TraceableException.hpp" /* * Class representing the parser that parses messages from encoded IR and @@ -21,29 +21,32 @@ class IrMessageParser { class OperationFailed : public TraceableException { public: // Constructors - OperationFailed (ErrorCode error_code, const char* const filename, int line_number) : - TraceableException (error_code, filename, line_number) {} + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} // Methods - [[nodiscard]] auto what () const noexcept -> const char* override { + [[nodiscard]] auto what() const noexcept -> char const* override { return "IrMessageParser 
operation failed"; } }; + // Constructor - explicit IrMessageParser (ReaderInterface& reader); + explicit IrMessageParser(ReaderInterface& reader); // Methods - auto get_ts_pattern () -> TimestampPattern* { return &m_ts_pattern; } - [[nodiscard]] auto get_parsed_msg () const -> const ParsedIrMessage& { return m_msg; } + auto get_ts_pattern() -> TimestampPattern* { return &m_ts_pattern; } + + [[nodiscard]] auto get_parsed_msg() const -> ParsedIrMessage const& { return m_msg; } + auto get_msg_logtype_entry() -> LogTypeDictionaryEntry& { return m_msg.get_logtype_entry(); } - [[nodiscard]] auto parse_next_encoded_message () -> bool; - static bool is_ir_encoded (size_t sequence_length, const char* data); -private: + [[nodiscard]] auto parse_next_encoded_message() -> bool; + static bool is_ir_encoded(size_t sequence_length, char const* data); +private: [[nodiscard]] auto parse_next_four_bytes_message() -> bool; [[nodiscard]] auto parse_next_eight_bytes_message() -> bool; - [[nodiscard]] auto decode_json_preamble (std::string& json_metadata) -> bool; + [[nodiscard]] auto decode_json_preamble(std::string& json_metadata) -> bool; // member variables bool m_is_four_bytes_encoded; @@ -53,4 +56,4 @@ class IrMessageParser { ReaderInterface& m_reader; }; -#endif // IrMessageParser_HPP \ No newline at end of file +#endif // IrMessageParser_HPP diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 94061eae3..54446ec5d 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -159,17 +159,20 @@ namespace { * @param reader * @param encoded_tag * @param ts Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t or the actual timestamp if - * encoded_variable_t == eight_byte_encoded_variable_t + * encoded_variable_t == four_byte_encoded_variable_t or the actual + * timestamp if encoded_variable_t == 
eight_byte_encoded_variable_t * @return IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to - * decode + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + * to decode */ template IRErrorCode parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { - static_assert(is_same_v || is_same_v); + static_assert( + (is_same_v + || is_same_v) + ); if constexpr (is_same_v) { if (cProtocol::Payload::TimestampVal != encoded_tag) { @@ -210,14 +213,14 @@ namespace { * @param reader * @param message Returns the decoded message * @param timestamp Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t or the actual timestamp if - * encoded_variable_t == eight_byte_encoded_variable_t + * encoded_variable_t == four_byte_encoded_variable_t or the actual + * timestamp if encoded_variable_t == eight_byte_encoded_variable_t * @return IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Decode_Error if the encoded message cannot be properly - * decoded - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to - * decode + * @return IRErrorCode_Decode_Error if the encoded message cannot be + * properly decoded + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + * to decode */ template IRErrorCode generic_decode_next_message( @@ -242,6 +245,11 @@ namespace { message.append(value, begin_pos, length); }; + // constant remainder handler + auto constant_remainder_handler = [&message](string const& value, size_t begin_pos) { + message.append(value, begin_pos); + }; + // encoded int handler auto encoded_int_handler = [&message](encoded_variable_t value) { message.append(decode_integer_var(value)); @@ -261,6 +269,7 @@ namespace { encoded_vars, dict_vars, constant_handler, +
constant_remainder_handler, encoded_int_handler, encoded_float_handler, dict_var_handler @@ -272,10 +281,11 @@ namespace { } /** - * Reads metadata information from the reader + * Reads metadata information from the given reader * @param reader * @param metadata_type Returns the type of the metadata found in the IR - * @param metadata_pos Returns the starting position of the metadata in reader + * @param metadata_pos Returns the starting position of the metadata in + * reader * @param metadata_size Returns the size of the metadata written in the IR * @return IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 7c5028f6e..3b71d5f1b 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -75,8 +75,10 @@ IRErrorCode generic_parse_tokens( * Decodes the message consists of the tokens and calls the given methods * to handle specific components of the message. * @tparam encoded_variable_t Type of the encoded variable - * @tparam ConstantHandler Method to handle constants. Signature: - * (const std::string&, size_t, size_t) -> void + * @tparam ConstantHandler Method to handle constants in the logtypes. + * Signature: (const std::string&, size_t, size_t) -> void + * @tparam ConstantRemainderHandler Method to handle remaining constant in the + * logtypes. Signature: (const std::string&, size_t) -> void * @tparam EncodedIntHandler Method to handle encoded integers. * Signature: (encoded_variable_t) -> void * @tparam EncodedFloatHandler Method to handle encoded float.
@@ -87,6 +89,7 @@ IRErrorCode generic_parse_tokens( * @param encoded_vars * @param dict_vars * @param constant_handler + * @param constant_remainder_handler * @param encoded_int_handler * @param encoded_float_handler * @param dict_var_handler @@ -95,6 +98,7 @@ IRErrorCode generic_parse_tokens( template < typename encoded_variable_t, typename ConstantHandler, + typename ConstantRemainderHandler, typename EncodedIntHandler, typename EncodedFloatHandler, typename DictVarHandler> @@ -103,6 +107,7 @@ void generic_decode_message( std::vector const& encoded_vars, std::vector const& dict_vars, ConstantHandler constant_handler, + ConstantRemainderHandler constant_remainder_handler, EncodedIntHandler encoded_int_handler, EncodedFloatHandler encoded_float_handler, DictVarHandler dict_var_handler diff --git a/components/core/src/ffi/ir_stream/decoding_methods.tpp b/components/core/src/ffi/ir_stream/decoding_methods.tpp index 52678e533..3f664ec17 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.tpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.tpp @@ -15,6 +15,7 @@ namespace ffi::ir_stream { template < typename encoded_variable_t, typename ConstantHandler, + typename ConstantRemainderHandler, typename EncodedIntHandler, typename EncodedFloatHandler, typename DictVarHandler> @@ -23,6 +24,7 @@ void generic_decode_message( std::vector const& encoded_vars, std::vector const& dict_vars, ConstantHandler constant_handler, + ConstantRemainderHandler constant_remainder_handler, EncodedIntHandler encoded_int_handler, EncodedFloatHandler encoded_float_handler, DictVarHandler dict_var_handler @@ -132,11 +134,7 @@ void generic_decode_message( } // Add remainder if (next_static_text_begin_pos < logtype_length) { - constant_handler( - logtype, - next_static_text_begin_pos, - logtype_length - next_static_text_begin_pos - ); + constant_remainder_handler(logtype, next_static_text_begin_pos); } } } // namespace ffi::ir_stream From c1efd29913e3bc819c44fa656cddeaf58cd7c36a 
Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 00:33:04 -0400 Subject: [PATCH 088/121] BufferedFileReader: Combine OperationFailed and OperationFailedWithMsg. --- components/core/src/BufferedFileReader.cpp | 2 +- components/core/src/BufferedFileReader.hpp | 20 ++++++++------------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index b8f6e3d85..a4cfed97e 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -217,7 +217,7 @@ void BufferedFileReader::open(string const& path) { auto const error_code = try_open(path); if (ErrorCode_Success != error_code) { if (ErrorCode_FileNotFound == error_code) { - throw OperationFailedWithMsg( + throw OperationFailed( error_code, __FILENAME__, __LINE__, diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 3c9993282..3b060c548 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -29,18 +29,14 @@ class BufferedFileReader : public ReaderInterface { public: // Constructors OperationFailed(ErrorCode error_code, char const* const filename, int line_number) - : TraceableException(error_code, filename, line_number) {} + : OperationFailed( + error_code, + filename, + line_number, + "BufferedFileReader operation failed" + ) {} - // Methods - [[nodiscard]] auto what() const noexcept -> char const* override { - return "BufferedFileReader operation failed"; - } - }; - - class OperationFailedWithMsg : public TraceableException { - public: - // Constructors - OperationFailedWithMsg( + OperationFailed( ErrorCode error_code, char const* const filename, int line_number, @@ -51,7 +47,7 @@ class BufferedFileReader : public ReaderInterface { // Methods [[nodiscard]] auto what() const noexcept -> char const* override { - 
return "BufferedFileReader operation failed"; + return m_message.c_str(); } private: From cf3c6e3f4b8502f833b9f4c337d0ea052494597a Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 01:25:57 -0400 Subject: [PATCH 089/121] BufferedFileReader: Replace quantize_to_buffer_size with general method. --- components/core/CMakeLists.txt | 3 +++ components/core/src/BufferedFileReader.cpp | 12 +++--------- components/core/src/BufferedFileReader.hpp | 7 ------- components/core/src/math_utils.hpp | 21 +++++++++++++++++++++ components/core/tests/test-math_utils.cpp | 22 ++++++++++++++++++++++ 5 files changed, 49 insertions(+), 16 deletions(-) create mode 100644 components/core/src/math_utils.hpp create mode 100644 components/core/tests/test-math_utils.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 1717a7a0d..14c0956fd 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -249,6 +249,7 @@ set(SOURCE_FILES_clp src/LogTypeDictionaryReader.hpp src/LogTypeDictionaryWriter.cpp src/LogTypeDictionaryWriter.hpp + src/math_utils.hpp src/MessageParser.cpp src/MessageParser.hpp src/MySQLDB.cpp @@ -766,6 +767,7 @@ set(SOURCE_FILES_unitTest src/LogTypeDictionaryReader.hpp src/LogTypeDictionaryWriter.cpp src/LogTypeDictionaryWriter.hpp + src/math_utils.hpp src/MessageParser.cpp src/MessageParser.hpp src/MySQLDB.cpp @@ -855,6 +857,7 @@ set(SOURCE_FILES_unitTest tests/test-Grep.cpp tests/test-ir_encoding_methods.cpp tests/test-main.cpp + tests/test-math_utils.cpp tests/test-ParserWithUserSchema.cpp tests/test-query_methods.cpp tests/test-Segment.cpp diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index a4cfed97e..4100cde98 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -9,6 +9,8 @@ // C++ standard libraries #include +#include "math_utils.hpp" + 
using std::string; namespace { @@ -284,13 +286,6 @@ auto BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_size) return ErrorCode_Success; } -auto BufferedFileReader::quantize_to_buffer_size(size_t size) const -> size_t { - if (size == 0) { - return 0; - } - return (1 + ((size - 1) / m_base_buffer_size)) * m_base_buffer_size; -} - auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> ErrorCode { size_t num_bytes_refilled = 0; @@ -340,8 +335,7 @@ auto BufferedFileReader::resize_buffer_from_pos(size_t pos) -> void { } auto const new_data_size = m_buffer_reader->get_buffer_size() - pos; - // Use a quantized size for the underlying buffer size - auto const buffer_size = quantize_to_buffer_size(new_data_size); + auto const buffer_size = int_round_up_to_multiple(new_data_size, m_base_buffer_size); m_buffer.erase(m_buffer.begin(), m_buffer.begin() + static_cast(pos)); m_buffer.resize(buffer_size); diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 3b060c548..7d72cf005 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -191,13 +191,6 @@ class BufferedFileReader : public ReaderInterface { private: // Methods - /** - * Quantize the given size to be the next integer multiple of buffer_size - * @param size - * @return quantized size - */ - [[nodiscard]] auto quantize_to_buffer_size(size_t size) const -> size_t; - /** * Reads next refill_size bytes from file descriptor to the internal buffer * and sets the data size of the internal buffer diff --git a/components/core/src/math_utils.hpp b/components/core/src/math_utils.hpp new file mode 100644 index 000000000..847785e89 --- /dev/null +++ b/components/core/src/math_utils.hpp @@ -0,0 +1,21 @@ +#ifndef MATH_UTILS_HPP +#define MATH_UTILS_HPP + +#include + +/** + * @tparam unsigned_t An unsigned integer type + * @param val + * @param factor Factor for the multiple. Cannot be 0. 
+ * @return The given value rounded up to the nearest multiple of the given + * factor + */ +template +auto int_round_up_to_multiple(unsigned_t val, unsigned_t factor) -> unsigned_t { + static_assert(std::is_unsigned_v); + // NOTE: "val + factor" could overflow, but the "- 1" will undo the + // overflow since overflow semantics are well-defined for unsigned integers. + return ((val + factor - 1) / factor) * factor; +} + +#endif // MATH_UTILS_HPP diff --git a/components/core/tests/test-math_utils.cpp b/components/core/tests/test-math_utils.cpp new file mode 100644 index 000000000..fca9b8685 --- /dev/null +++ b/components/core/tests/test-math_utils.cpp @@ -0,0 +1,22 @@ +#include + +#include "../src/math_utils.hpp" + +TEST_CASE("int_round_up_to_multiple", "[math_utils]") { + // Factor of 1 + CHECK(int_round_up_to_multiple(0U, 1U) == 0); + CHECK(int_round_up_to_multiple(1U, 1U) == 1); + CHECK(int_round_up_to_multiple(2U, 1U) == 2); + + // Factor of 10 + CHECK(int_round_up_to_multiple(0U, 10U) == 0); + CHECK(int_round_up_to_multiple(1U, 10U) == 10); + CHECK(int_round_up_to_multiple(10U, 10U) == 10); + CHECK(int_round_up_to_multiple(11U, 10U) == 20); + + // Test value and factor which could overflow + // Round up (2^64 / 2) to the nearest multiple of (2^64 / 2) + uint64_t const factor = (1ULL << 63); + uint64_t const val = (1ULL << 63); + CHECK(int_round_up_to_multiple(val, factor) == val); +} From e5bc6432cf79f9ded0800adb0488759a366927fa Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 01:31:05 -0400 Subject: [PATCH 090/121] Remove include-grouping comments since clang-format handles it.
--- components/core/src/BufferedFileReader.cpp | 7 ++----- components/core/src/BufferedFileReader.hpp | 2 -- components/core/tests/test-BufferedFileReader.cpp | 6 +----- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 4100cde98..5dffd8496 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -1,14 +1,11 @@ #include "BufferedFileReader.hpp" -// Boost libraries -#include - -// C standard libraries #include -// C++ standard libraries #include +#include + #include "math_utils.hpp" using std::string; diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 7d72cf005..9573d5d3f 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -1,14 +1,12 @@ #ifndef BUFFEREDFILEREADER_HPP #define BUFFEREDFILEREADER_HPP -// C++ libraries #include #include #include #include #include -// Project headers #include "BufferReader.hpp" #include "Defs.h" #include "ErrorCode.hpp" diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index eb568c10c..40fc9156e 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -1,10 +1,6 @@ -// Boost libraries #include +#include -// Catch2 -#include "../submodules/Catch2/single_include/catch2/catch.hpp" - -// Project headers #include "../src/BufferedFileReader.hpp" #include "../src/FileWriter.hpp" From c8a99819c49cca5292f5984e09512445f1a8f485 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 02:06:41 -0400 Subject: [PATCH 091/121] BufferedFileReader: Reset buffer reader even on EOF. 
--- components/core/src/BufferedFileReader.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 5dffd8496..93af7ca54 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -318,12 +318,13 @@ auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> Err bytes_to_read, num_bytes_refilled ); - if (error_code != ErrorCode_Success) { + if (error_code != ErrorCode_Success && ErrorCode_EndOfFile != error_code) { return error_code; } + // NOTE: We still want to set the buffer reader if no bytes were read on EOF m_buffer_reader .emplace(m_buffer.data(), num_bytes_refilled + buf_internal_pos, buf_internal_pos); - return ErrorCode_Success; + return error_code; } auto BufferedFileReader::resize_buffer_from_pos(size_t pos) -> void { From f28dea965bab1281326889f9ac752d7536761cce Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 02:38:18 -0400 Subject: [PATCH 092/121] BufferedFileReader: Reduce indentation by rewriting branches. 
--- components/core/src/BufferedFileReader.cpp | 60 +++++++++++----------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 93af7ca54..82542c51a 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -16,18 +16,19 @@ auto try_read_into_buffer(int fd, char* buf, size_t num_bytes_to_read, size_t& n num_bytes_read = 0; while (true) { auto const bytes_read = ::read(fd, buf, num_bytes_to_read); - if (bytes_read > 0) { - buf += bytes_read; - num_bytes_read += bytes_read; - num_bytes_to_read -= bytes_read; - if (num_bytes_read == num_bytes_to_read) { - return ErrorCode_Success; - } - } else if (0 == bytes_read) { + if (0 == bytes_read) { break; - } else { + } + if (bytes_read < 0) { return ErrorCode_errno; } + + buf += bytes_read; + num_bytes_read += bytes_read; + num_bytes_to_read -= bytes_read; + if (num_bytes_read == num_bytes_to_read) { + return ErrorCode_Success; + } } if (0 == num_bytes_read) { return ErrorCode_EndOfFile; @@ -64,11 +65,7 @@ auto BufferedFileReader::try_seek_from_begin(size_t pos) -> ErrorCode { return ErrorCode_Success; } - size_t seek_lower_bound = m_file_pos; - if (m_checkpoint_pos.has_value()) { - seek_lower_bound = m_checkpoint_pos.value(); - } - + auto seek_lower_bound = m_checkpoint_pos.has_value() ? 
m_checkpoint_pos.value() : m_file_pos; if (pos < seek_lower_bound) { return ErrorCode_Failure; } @@ -161,7 +158,7 @@ auto BufferedFileReader::try_read_to_delimiter( } bool found_delim{false}; size_t total_num_bytes_read{0}; - while (false == found_delim) { + while (true) { size_t num_bytes_read{0}; if (auto ret_code = m_buffer_reader->try_read_to_delimiter( delim, @@ -176,18 +173,19 @@ auto BufferedFileReader::try_read_to_delimiter( } update_file_pos(m_file_pos + num_bytes_read); total_num_bytes_read += num_bytes_read; + if (found_delim) { + break; + } - if (false == found_delim) { - auto error_code = refill_reader_buffer(m_base_buffer_size); - if (ErrorCode_EndOfFile == error_code) { - if (total_num_bytes_read == 0) { - return ErrorCode_EndOfFile; - } - return ErrorCode_Success; - } - if (ErrorCode_Success != error_code) { - return error_code; + auto error_code = refill_reader_buffer(m_base_buffer_size); + if (ErrorCode_EndOfFile == error_code) { + if (total_num_bytes_read == 0) { + return ErrorCode_EndOfFile; } + break; + } + if (ErrorCode_Success != error_code) { + return error_code; } } return ErrorCode_Success; @@ -245,12 +243,12 @@ auto BufferedFileReader::close() -> ErrorCode { } auto BufferedFileReader::set_checkpoint() -> size_t { - if (m_checkpoint_pos.has_value() && m_checkpoint_pos < m_file_pos) { - if (m_buffer_reader->get_buffer_size() != m_base_buffer_size) { - // allocate new buffer for buffered data starting from pos - resize_buffer_from_pos(m_buffer_reader->get_pos()); - m_buffer_reader->seek_from_begin(get_buffer_relative_pos(m_file_pos)); - } + if (m_checkpoint_pos.has_value() && m_checkpoint_pos < m_file_pos + && m_buffer_reader->get_buffer_size() != m_base_buffer_size) + { + // allocate new buffer for buffered data starting from pos + resize_buffer_from_pos(m_buffer_reader->get_pos()); + m_buffer_reader->seek_from_begin(get_buffer_relative_pos(m_file_pos)); } m_checkpoint_pos = m_file_pos; return m_file_pos; From 
8051ed210a21ffb50102e3eb8833cd4f31b36fb7 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 02:41:41 -0400 Subject: [PATCH 093/121] BufferedFileReader: Return appropriate error code when trying to seek to an unsupported position. --- components/core/src/BufferedFileReader.cpp | 2 +- components/core/tests/test-BufferedFileReader.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 82542c51a..17faabf2e 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -67,7 +67,7 @@ auto BufferedFileReader::try_seek_from_begin(size_t pos) -> ErrorCode { auto seek_lower_bound = m_checkpoint_pos.has_value() ? m_checkpoint_pos.value() : m_file_pos; if (pos < seek_lower_bound) { - return ErrorCode_Failure; + return ErrorCode_Unsupported; } auto error_code = m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos)); diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 40fc9156e..707a9a1fc 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -82,7 +82,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos2, num_bytes_to_read)); // the seek should fail on a backward seek when checkpoint is not enabled - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos2)); + REQUIRE(ErrorCode_Unsupported == file_reader.try_seek_from_begin(seek_pos2)); } SECTION("seek with checkpoint") { @@ -118,7 +118,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { latest_file_pos = std::max(latest_file_pos, file_reader.get_pos()); // now try to seek back to a pos that's before the checkpoint - REQUIRE(ErrorCode_Failure == 
file_reader.try_seek_from_begin(checkpoint_pos - 1)); + REQUIRE(ErrorCode_Unsupported == file_reader.try_seek_from_begin(checkpoint_pos - 1)); // now go back to latest data REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(latest_file_pos)); @@ -135,7 +135,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { file_reader.seek_from_begin((latest_file_pos + checkpoint_pos) / 2); file_reader.set_checkpoint(); // the previous seek_pos should be unavailable - REQUIRE(ErrorCode_Failure == file_reader.try_seek_from_begin(seek_pos_1)); + REQUIRE(ErrorCode_Unsupported == file_reader.try_seek_from_begin(seek_pos_1)); // make sure data read after checkpoint-set are still correct size_t num_bytes_to_read_4 = 4096; From 3aaf117d5760e0bf22378e75753eec96b59fbaae Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 03:15:27 -0400 Subject: [PATCH 094/121] BufferedFileReader: Refactor refill_reader_buffer --- components/core/src/BufferedFileReader.cpp | 46 +++++++++++----------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 17faabf2e..695901b9b 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -282,46 +282,48 @@ auto BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_size) } auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> ErrorCode { - size_t num_bytes_refilled = 0; - auto const buffer_end_pos = get_buffer_end_pos(); auto const data_size = m_buffer_reader->get_buffer_size(); auto const available_buffer_space = m_buffer.size() - data_size; - size_t buf_internal_pos{0}; - size_t bytes_to_read = m_base_buffer_size - (buffer_end_pos % m_base_buffer_size); + size_t num_bytes_to_read{0}; + size_t next_buffer_pos{0}; + auto next_buffer_begin_pos = m_buffer_begin_pos; if 
(m_checkpoint_pos.has_value()) { - while (bytes_to_read < num_bytes_to_refill) { - bytes_to_read += m_base_buffer_size; - } - // Grow the buffer if bytes_to_read is more - // than available space in the buffer - if (bytes_to_read > available_buffer_space) { - m_buffer.resize(data_size + bytes_to_read); + num_bytes_to_read = int_round_up_to_multiple( + buffer_end_pos + num_bytes_to_refill, + m_base_buffer_size + ); + // Grow the buffer if necessary + if (num_bytes_to_read > available_buffer_space) { + m_buffer.resize(data_size + num_bytes_to_read); } - buf_internal_pos = data_size; + next_buffer_pos = data_size; } else { - if (bytes_to_read > available_buffer_space) { - // advance the entire buffer - buf_internal_pos = 0; - m_buffer_begin_pos = buffer_end_pos; + num_bytes_to_read = m_base_buffer_size - (buffer_end_pos % m_base_buffer_size); + if (num_bytes_to_read > available_buffer_space) { + // Advance the entire buffer since we don't grow the buffer if + // there's no checkpoint + next_buffer_pos = 0; + next_buffer_begin_pos = buffer_end_pos; } else { - buf_internal_pos = data_size; + next_buffer_pos = data_size; } } + size_t num_bytes_read{0}; auto error_code = try_read_into_buffer( m_fd, - &m_buffer[buf_internal_pos], - bytes_to_read, - num_bytes_refilled + &m_buffer[next_buffer_pos], + num_bytes_to_read, + num_bytes_read ); if (error_code != ErrorCode_Success && ErrorCode_EndOfFile != error_code) { return error_code; } // NOTE: We still want to set the buffer reader if no bytes were read on EOF - m_buffer_reader - .emplace(m_buffer.data(), num_bytes_refilled + buf_internal_pos, buf_internal_pos); + m_buffer_reader.emplace(m_buffer.data(), next_buffer_pos + num_bytes_read, next_buffer_pos); + m_buffer_begin_pos = next_buffer_begin_pos; return error_code; } From 9f978352404eaa411ccb81688d50161eed0061bb Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 03:35:24 -0400 Subject: [PATCH 095/121] 
BufferedFileReader: Refactor resize_buffer_from_pos --- components/core/src/BufferedFileReader.cpp | 29 +++++++++++----------- components/core/src/BufferedFileReader.hpp | 7 +++--- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 695901b9b..7a94f036c 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -246,9 +246,7 @@ auto BufferedFileReader::set_checkpoint() -> size_t { if (m_checkpoint_pos.has_value() && m_checkpoint_pos < m_file_pos && m_buffer_reader->get_buffer_size() != m_base_buffer_size) { - // allocate new buffer for buffered data starting from pos - resize_buffer_from_pos(m_buffer_reader->get_pos()); - m_buffer_reader->seek_from_begin(get_buffer_relative_pos(m_file_pos)); + drop_content_before_current_pos(); } m_checkpoint_pos = m_file_pos; return m_file_pos; @@ -259,8 +257,12 @@ auto BufferedFileReader::clear_checkpoint() -> void { return; } - m_file_pos = m_highest_read_pos; - resize_buffer_from_pos(get_buffer_relative_pos(m_file_pos)); + auto error_code = try_seek_from_begin(m_highest_read_pos); + if (ErrorCode_Success != error_code) { + // Should never happen + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + drop_content_before_current_pos(); m_checkpoint_pos.reset(); } @@ -327,17 +329,14 @@ auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> Err return error_code; } -auto BufferedFileReader::resize_buffer_from_pos(size_t pos) -> void { - if (pos > m_buffer_reader->get_buffer_size()) { - throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); - } - - auto const new_data_size = m_buffer_reader->get_buffer_size() - pos; - auto const buffer_size = int_round_up_to_multiple(new_data_size, m_base_buffer_size); +auto BufferedFileReader::drop_content_before_current_pos() -> void { + auto buffer_reader_pos = m_buffer_reader->get_pos(); + auto 
const new_data_size = m_buffer_reader->get_buffer_size() - buffer_reader_pos; + auto const new_buffer_size = int_round_up_to_multiple(new_data_size, m_base_buffer_size); - m_buffer.erase(m_buffer.begin(), m_buffer.begin() + static_cast(pos)); - m_buffer.resize(buffer_size); - m_buffer_begin_pos += pos; + m_buffer.erase(m_buffer.begin(), m_buffer.begin() + static_cast(buffer_reader_pos)); + m_buffer.resize(new_buffer_size); + m_buffer_begin_pos += buffer_reader_pos; m_buffer_reader.emplace(m_buffer.data(), new_data_size); } diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 9573d5d3f..9fa21fa3d 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -203,11 +203,10 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] auto refill_reader_buffer(size_t refill_size) -> ErrorCode; /** - * Discard the data before pos from internal reader buffer and resize the - * buffer - * @param pos + * Discards the data before the current position and resizes the buffer + * accordingly. 
*/ - auto resize_buffer_from_pos(size_t pos) -> void; + auto drop_content_before_current_pos() -> void; /** * return the file_pos's corresponding pos in the internal buffer From 53c5580bf2f2e9caa47f0014a60dd558e79aecb2 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 03:40:22 -0400 Subject: [PATCH 096/121] BufferedFileReader: Remaining refactoring --- components/core/src/BufferedFileReader.cpp | 42 +++++++++++++--------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 7a94f036c..be163f5f3 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -11,7 +11,20 @@ using std::string; namespace { -auto try_read_into_buffer(int fd, char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) +/** + * Reads from the given file descriptor + * @param fd + * @param buf + * @param num_bytes_to_read + * @param num_bytes_read + * @return ErrorCode_errno on error + * @return ErrorCode_EndOfFile on EOF + * @return ErrorCode_Success on success + */ +auto read_into_buffer(int fd, char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) + -> ErrorCode; + +auto read_into_buffer(int fd, char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode { num_bytes_read = 0; while (true) { @@ -73,9 +86,9 @@ auto BufferedFileReader::try_seek_from_begin(size_t pos) -> ErrorCode { auto error_code = m_buffer_reader->try_seek_from_begin(get_buffer_relative_pos(pos)); if (ErrorCode_Truncated == error_code) { if (false == m_checkpoint_pos.has_value()) { - // if checkpoint is not set, simply move the file_pos and invalidate + // If checkpoint is not set, simply move the file_pos and invalidate // the buffer reader - auto offset = lseek(m_fd, static_cast<__off64_t>(pos), SEEK_SET); + auto offset = lseek(m_fd, static_cast(pos), SEEK_SET); if (-1 == offset) { 
return ErrorCode_errno; } @@ -179,7 +192,7 @@ auto BufferedFileReader::try_read_to_delimiter( auto error_code = refill_reader_buffer(m_base_buffer_size); if (ErrorCode_EndOfFile == error_code) { - if (total_num_bytes_read == 0) { + if (0 == total_num_bytes_read) { return ErrorCode_EndOfFile; } break; @@ -226,18 +239,17 @@ void BufferedFileReader::open(string const& path) { } auto BufferedFileReader::close() -> ErrorCode { - int close_result{0}; if (-1 != m_fd) { - close_result = ::close(m_fd); - - m_fd = -1; if (m_checkpoint_pos.has_value()) { m_buffer.resize(m_base_buffer_size); m_checkpoint_pos.reset(); } - } - if (0 != close_result) { - return ErrorCode_errno; + + auto close_result = ::close(m_fd); + m_fd = -1; + if (0 != close_result) { + return ErrorCode_errno; + } } return ErrorCode_Success; } @@ -314,12 +326,8 @@ auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> Err } size_t num_bytes_read{0}; - auto error_code = try_read_into_buffer( - m_fd, - &m_buffer[next_buffer_pos], - num_bytes_to_read, - num_bytes_read - ); + auto error_code + = read_into_buffer(m_fd, &m_buffer[next_buffer_pos], num_bytes_to_read, num_bytes_read); if (error_code != ErrorCode_Success && ErrorCode_EndOfFile != error_code) { return error_code; } From 64d3a5dac370f14c6dfacb4aeb26b4afb408f27b Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 03:48:33 -0400 Subject: [PATCH 097/121] BufferedFileReader: Fix docstrings --- components/core/src/BufferedFileReader.hpp | 160 +++++++++++---------- 1 file changed, 84 insertions(+), 76 deletions(-) diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 9fa21fa3d..9c6d49041 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -14,11 +14,25 @@ #include "TraceableException.hpp" /** - * Class for reading from a on-disk file with custom buffering. 
- * The BufferedFileReader is designed to support files that only allow - * sequential access, such as files in S3. The class uses a checkpoint - * mechanism to support seeking and reading from a previous file position - * without having to actually accessing the file. + * Class for performing buffered (in memory) reads from an on-disk file with + * control over when and how much data is buffered. This allows us to support + * use cases where we want to perform unordered reads from files which only + * support sequential access (e.g. files from block storage like S3). + * + * To control how much data is buffered, we allow callers to set a checkpoint + * such that all reads and seeks past the checkpoint will be buffered until the + * checkpoint is cleared. This allows callers to perform random seeks and reads + * of any data after (and including) the checkpoint. When no checkpoint is set, + * we maintain a fixed-size buffer. + * + * NOTE 1: Unless otherwise noted, the "file position" mentioned in docstrings + * is the position in the buffered file, not the position in the on-disk file. + * + * NOTE 2: This class restricts the buffer size to a multiple of the page size + * and we avoid reading anything less than a page to avoid multiple page faults. + * + * NOTE 3: Although the FILE stream interface provided by glibc also performs + * buffered reads, it does not allow us to control the buffering. */ class BufferedFileReader : public ReaderInterface { public: @@ -52,14 +66,22 @@ class BufferedFileReader : public ReaderInterface { std::string m_message; }; + // Constants + static constexpr size_t cMinBufferSize = (1ULL << 12); + // Constructors - BufferedFileReader(size_t base_buffer_size); + /** + * @param base_buffer_size The size for the fixed-size buffer used when no + * checkpoint is set. It must be a multiple of + * BufferedFileReader::cMinBufferSize. 
+ */ + explicit BufferedFileReader(size_t base_buffer_size); BufferedFileReader() : BufferedFileReader(cDefaultBufferSize) {} ~BufferedFileReader(); - // explicitly disable copy or move + // Disable copy/move construction/assignment BufferedFileReader(BufferedFileReader const&) = delete; BufferedFileReader(BufferedFileReader&&) = delete; auto operator=(BufferedFileReader) -> BufferedFileReader& = delete; @@ -67,18 +89,26 @@ class BufferedFileReader : public ReaderInterface { // Methods implementing the ReaderInterface /** - * Tries to get the current position of the read head in the file - * @param pos Position of the read head in the file - * @return ErrorCode_NotInit if the file is not open - * @return ErrorCode_errno on error + * @param pos Returns the position of the read head in the file + * @return ErrorCode_NotInit if the file isn't open * @return ErrorCode_Success on success */ [[nodiscard]] auto try_get_pos(size_t& pos) -> ErrorCode override; + /** - * Tries to seek from the beginning of the file to the given position + * Tries to seek to the given position relative to the beginning of the + * file. When no checkpoint is set, callers can only seek forwards in the + * file; When a checkpoint is set, callers can seek to any position in the + * file that's after and including the checkpoint. * @param pos - * @return ErrorCode_NotInit if the file is not open - * @return ErrorCode_errno on error + * @return ErrorCode_NotInit if the file isn't open + * @return ErrorCode_Unsupported if a checkpoint is set and the requested + * position is less than the checkpoint, or no checkpoint is set and the + * requested position is less the current read head's position. 
+ * @return ErrorCode_Truncated if we reached the end of the file before we + * reached the given position + * @return ErrorCode_errno on error reading from the underlying file + * @return Same as BufferReader::try_seek_from_begin if it fails * @return ErrorCode_Success on success */ [[nodiscard]] auto try_seek_from_begin(size_t pos) -> ErrorCode override; @@ -89,8 +119,8 @@ class BufferedFileReader : public ReaderInterface { * @param num_bytes_to_read The number of bytes to try and read * @param num_bytes_read The actual number of bytes read * @return ErrorCode_NotInit if the file is not open - * @return ErrorCode_BadParam if buf is invalid - * @return ErrorCode_errno on error + * @return ErrorCode_BadParam if buf is null + * @return ErrorCode_errno on error reading from the underlying file * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Success on success */ @@ -98,25 +128,24 @@ class BufferedFileReader : public ReaderInterface { -> ErrorCode override; /** - * Tries to read a string from the file until it reaches - * the specified delimiter - * @param delim The delimiter to stop at - * @param keep_delimiter Whether to include the delimiter in the - * output string or not - * @param append Whether to append to the given string or - * replace its contents - * @param str The string read - * @return ErrorCode_Success on success + * Tries to read up to an occurrence of the given delimiter + * @param delim + * @param keep_delimiter Whether to include the delimiter in the output + * string + * @param append Whether to append to the given string or replace its + * contents + * @param str Returns the content read + * @return ErrorCode_NotInit if the file is not open * @return ErrorCode_EndOfFile on EOF - * @return ErrorCode_errno otherwise + * @return ErrorCode_errno on error reading from the underlying file + * @return Same as BufferReader::try_read_to_delimiter if it fails + * @return ErrorCode_Success on success */ [[nodiscard]] auto try_read_to_delimiter(char 
delim, bool keep_delimiter, bool append, std::string& str) -> ErrorCode override; // Methods - [[nodiscard]] auto is_open() const -> bool { return -1 != m_fd; } - /** * Tries to open a file * @param path @@ -128,7 +157,6 @@ class BufferedFileReader : public ReaderInterface { /** * Opens a file * @param path - * @throw BufferedFileReader::OperationFailed on failure */ auto open(std::string const& path) -> void; /** @@ -139,68 +167,50 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] auto get_path() const -> std::string const& { return m_path; } /** - * Peeks the buffer without advancing the file - * pos. - * Note: If further operation such as read or peek is called on the - * BufferedFileReader after peek_buffered_data, the buf could - * point to invalid data - * @param buf pointer pointing to peeked data - * @param peek_size returns number of bytes peeked by reference - * @return ErrorCode_Success on success - * @return ErrorCode_errno on error + * Peeks the remaining buffered content without advancing the read head. + * + * NOTE: Any subsequent read or seek operations may invalidate the returned + * buffer. + * @param buf Returns a pointer to the remaining content in the buffer + * @param peek_size Returns the size of the remaining content in the buffer * @return ErrorCode_NotInit if the file is not opened - * @return ErrorCode_EndOfFile if already reaching the eof + * @return ErrorCode_errno on on error reading from the underlying file + * @return ErrorCode_EndOfFile if we've already reached EOF + * @return ErrorCode_Success on success */ [[nodiscard]] auto peek_buffered_data(char const*& data_ptr, size_t& peek_size) -> ErrorCode; /** - * Sets a checkpoint at the current file pos. - * By default, the checkpoint is not set and the BufferedFileReader only - * maintains a fixed size buffer. Seeking before the reading pos is not - * supported since the data might not be in the buffer anymore. 
+ * Sets a checkpoint at the current position in the file. If a checkpoint is + * already set, this method will discard any buffered content from before + * the current checkpoint. * - * When the checkpoint is set, the BufferedFileReader increases its - * internal buffer size on demand and buffer all data between the - * checkpoint pos and largest ever file_pos in the memory. - * It then support seeking back to a previous file pos that's after the - * checkpoint pos, as the data is guaranteed to be available in the internal - * buffer. - * - * Note: Setting a checkpoint may result in higher memory usage since - * the BufferedFileReader needs to exhaustively buffer the data it reads - * in the buffer. - * @return current file pos + * NOTE: Setting a checkpoint may result in higher memory usage since the + * BufferedFileReader needs to buffer all the data it reads after the + * checkpoint. + * @return The current position in the file */ auto set_checkpoint() -> size_t; /** - * Disable the checkpoint pos and release buffered data from memory - * The function resize the internal buffer based on the following rules. - * 1. If the current reading_pos is within the same m_buffer_size region as - * the buffer end pos (the file pos that end of buffer corresponds to). i.e. - * buffer_end_pos - file_pos < m_buffer_size - * the buffer will be resized to m_buffer_size bytes - * 2. Else, The buffer will be resized to the rounded result of - * quantizing (buffer_end_pos - file_pos) to the nearest multiple of - * 'm_buffer_size' using the rounding method. This ensures that the current - * read pos still resides in the resized buffer + * Clears the current checkpoint and moves the read head to the highest + * position that the caller read/seeked to. This will shrink the buffer to + * its original size, discarding any excess data. 
*/ auto clear_checkpoint() -> void; private: // Methods /** - * Reads next refill_size bytes from file descriptor to the internal buffer - * and sets the data size of the internal buffer - * Note: the function returns success even if the number of bytes read from - * the fd is less than the refill_size + * Refills the buffer with up to the given number of bytes from the + * underlying file. + * + * NOTE: Callers must ensure the current buffer has been exhausted before + * calling this method (i.e., the read head is at the end of the buffer). * @param refill_size - * @return ErrorCode_Success on success - * @return ErrorCode_errno on error - * @return ErrorCode_NotInit if the file is not opened - * @return ErrorCode_EndOfFile if already reaching the eof + * @return Same as read_into_buffer */ - [[nodiscard]] auto refill_reader_buffer(size_t refill_size) -> ErrorCode; + [[nodiscard]] auto refill_reader_buffer(size_t num_bytes_to_refill) -> ErrorCode; /** * Discards the data before the current position and resizes the buffer @@ -209,9 +219,8 @@ class BufferedFileReader : public ReaderInterface { auto drop_content_before_current_pos() -> void; /** - * return the file_pos's corresponding pos in the internal buffer * @param file_pos - * @return + * @return \p file_pos relative to the beginning of the buffer */ [[nodiscard]] auto get_buffer_relative_pos(size_t file_pos) const -> size_t { return file_pos - m_buffer_begin_pos; @@ -224,7 +233,6 @@ class BufferedFileReader : public ReaderInterface { auto update_file_pos(size_t pos) -> void; // Constants - static constexpr size_t cMinBufferSize = (1ULL << 12); static constexpr size_t cDefaultBufferSize = (16 * cMinBufferSize); // Variables From 455b3cd30f268c32f4e73fb2ad9dab60d5d9cfaa Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 04:34:07 -0400 Subject: [PATCH 098/121] Replace off64_t with generic type. 
--- components/core/src/BufferedFileReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index be163f5f3..bd22177a6 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -88,7 +88,7 @@ auto BufferedFileReader::try_seek_from_begin(size_t pos) -> ErrorCode { if (false == m_checkpoint_pos.has_value()) { // If checkpoint is not set, simply move the file_pos and invalidate // the buffer reader - auto offset = lseek(m_fd, static_cast(pos), SEEK_SET); + auto offset = lseek(m_fd, static_cast(pos), SEEK_SET); if (-1 == offset) { return ErrorCode_errno; } From 08581d76a9cebc8510dd9b5e6fc4472b96f1f9ba Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 08:52:06 -0400 Subject: [PATCH 099/121] BufferedFileReader: Reorder methods according to guidelines. --- components/core/src/BufferedFileReader.cpp | 182 ++++++++++----------- components/core/src/BufferedFileReader.hpp | 108 ++++++------ 2 files changed, 145 insertions(+), 145 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index bd22177a6..88385d3d4 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -62,6 +62,97 @@ BufferedFileReader::~BufferedFileReader() { std::ignore = close(); } +auto BufferedFileReader::try_open(string const& path) -> ErrorCode { + // Cleanup in case caller forgot to call close before calling this function + std::ignore = close(); + + m_fd = ::open(path.c_str(), O_RDONLY); + if (-1 == m_fd) { + if (ENOENT == errno) { + return ErrorCode_FileNotFound; + } + return ErrorCode_errno; + } + m_path = path; + m_file_pos = 0; + m_buffer_begin_pos = 0; + m_buffer_reader.emplace(m_buffer.data(), 0); + m_highest_read_pos = 0; + return ErrorCode_Success; +} + +void 
BufferedFileReader::open(string const& path) { + auto const error_code = try_open(path); + if (ErrorCode_Success != error_code) { + if (ErrorCode_FileNotFound == error_code) { + throw OperationFailed( + error_code, + __FILENAME__, + __LINE__, + "File not found: " + boost::filesystem::weakly_canonical(path).string() + ); + } + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} + +auto BufferedFileReader::close() -> ErrorCode { + if (-1 != m_fd) { + if (m_checkpoint_pos.has_value()) { + m_buffer.resize(m_base_buffer_size); + m_checkpoint_pos.reset(); + } + + auto close_result = ::close(m_fd); + m_fd = -1; + if (0 != close_result) { + return ErrorCode_errno; + } + } + return ErrorCode_Success; +} + +auto BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_size) -> ErrorCode { + if (-1 == m_fd) { + return ErrorCode_NotInit; + } + // Refill the buffer if it is not loaded yet + if (0 == m_buffer_reader->get_buffer_size()) { + auto error_code = refill_reader_buffer(m_base_buffer_size); + if (ErrorCode_Success != error_code) { + buf = nullptr; + peek_size = 0; + return error_code; + } + } + m_buffer_reader->peek_buffer(buf, peek_size); + return ErrorCode_Success; +} + +auto BufferedFileReader::set_checkpoint() -> size_t { + if (m_checkpoint_pos.has_value() && m_checkpoint_pos < m_file_pos + && m_buffer_reader->get_buffer_size() != m_base_buffer_size) + { + drop_content_before_current_pos(); + } + m_checkpoint_pos = m_file_pos; + return m_file_pos; +} + +auto BufferedFileReader::clear_checkpoint() -> void { + if (false == m_checkpoint_pos.has_value()) { + return; + } + + auto error_code = try_seek_from_begin(m_highest_read_pos); + if (ErrorCode_Success != error_code) { + // Should never happen + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + drop_content_before_current_pos(); + m_checkpoint_pos.reset(); +} + auto BufferedFileReader::try_get_pos(size_t& pos) -> ErrorCode { if (-1 == m_fd) { return ErrorCode_NotInit; @@ 
-204,97 +295,6 @@ auto BufferedFileReader::try_read_to_delimiter( return ErrorCode_Success; } -auto BufferedFileReader::try_open(string const& path) -> ErrorCode { - // Cleanup in case caller forgot to call close before calling this function - std::ignore = close(); - - m_fd = ::open(path.c_str(), O_RDONLY); - if (-1 == m_fd) { - if (ENOENT == errno) { - return ErrorCode_FileNotFound; - } - return ErrorCode_errno; - } - m_path = path; - m_file_pos = 0; - m_buffer_begin_pos = 0; - m_buffer_reader.emplace(m_buffer.data(), 0); - m_highest_read_pos = 0; - return ErrorCode_Success; -} - -void BufferedFileReader::open(string const& path) { - auto const error_code = try_open(path); - if (ErrorCode_Success != error_code) { - if (ErrorCode_FileNotFound == error_code) { - throw OperationFailed( - error_code, - __FILENAME__, - __LINE__, - "File not found: " + boost::filesystem::weakly_canonical(path).string() - ); - } - throw OperationFailed(error_code, __FILENAME__, __LINE__); - } -} - -auto BufferedFileReader::close() -> ErrorCode { - if (-1 != m_fd) { - if (m_checkpoint_pos.has_value()) { - m_buffer.resize(m_base_buffer_size); - m_checkpoint_pos.reset(); - } - - auto close_result = ::close(m_fd); - m_fd = -1; - if (0 != close_result) { - return ErrorCode_errno; - } - } - return ErrorCode_Success; -} - -auto BufferedFileReader::set_checkpoint() -> size_t { - if (m_checkpoint_pos.has_value() && m_checkpoint_pos < m_file_pos - && m_buffer_reader->get_buffer_size() != m_base_buffer_size) - { - drop_content_before_current_pos(); - } - m_checkpoint_pos = m_file_pos; - return m_file_pos; -} - -auto BufferedFileReader::clear_checkpoint() -> void { - if (false == m_checkpoint_pos.has_value()) { - return; - } - - auto error_code = try_seek_from_begin(m_highest_read_pos); - if (ErrorCode_Success != error_code) { - // Should never happen - throw OperationFailed(error_code, __FILENAME__, __LINE__); - } - drop_content_before_current_pos(); - m_checkpoint_pos.reset(); -} - -auto 
BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_size) -> ErrorCode { - if (-1 == m_fd) { - return ErrorCode_NotInit; - } - // Refill the buffer if it is not loaded yet - if (0 == m_buffer_reader->get_buffer_size()) { - auto error_code = refill_reader_buffer(m_base_buffer_size); - if (ErrorCode_Success != error_code) { - buf = nullptr; - peek_size = 0; - return error_code; - } - } - m_buffer_reader->peek_buffer(buf, peek_size); - return ErrorCode_Success; -} - auto BufferedFileReader::refill_reader_buffer(size_t num_bytes_to_refill) -> ErrorCode { auto const buffer_end_pos = get_buffer_end_pos(); auto const data_size = m_buffer_reader->get_buffer_size(); diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 9c6d49041..2a8bc00c9 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -87,6 +87,60 @@ class BufferedFileReader : public ReaderInterface { auto operator=(BufferedFileReader) -> BufferedFileReader& = delete; auto operator=(BufferedFileReader&&) -> BufferedFileReader& = delete; + // Methods + /** + * Tries to open a file + * @param path + * @return ErrorCode_Success on success + * @return ErrorCode_FileNotFound if the file was not found + * @return ErrorCode_errno otherwise + */ + [[nodiscard]] auto try_open(std::string const& path) -> ErrorCode; + + auto open(std::string const& path) -> void; + + /** + * Closes the file if it's open + * @return ErrorCode_errno on error closing the underlying file + * @return ErrorCode_Success on success + */ + [[nodiscard]] auto close() -> ErrorCode; + + [[nodiscard]] auto get_path() const -> std::string const& { return m_path; } + + /** + * Peeks the remaining buffered content without advancing the read head. + * + * NOTE: Any subsequent read or seek operations may invalidate the returned + * buffer. 
+ * @param buf Returns a pointer to the remaining content in the buffer + * @param peek_size Returns the size of the remaining content in the buffer + * @return ErrorCode_NotInit if the file is not opened + * @return ErrorCode_errno on on error reading from the underlying file + * @return ErrorCode_EndOfFile if we've already reached EOF + * @return ErrorCode_Success on success + */ + [[nodiscard]] auto peek_buffered_data(char const*& data_ptr, size_t& peek_size) -> ErrorCode; + + /** + * Sets a checkpoint at the current position in the file. If a checkpoint is + * already set, this method will discard any buffered content from before + * the current checkpoint. + * + * NOTE: Setting a checkpoint may result in higher memory usage since the + * BufferedFileReader needs to buffer all the data it reads after the + * checkpoint. + * @return The current position in the file + */ + auto set_checkpoint() -> size_t; + + /** + * Clears the current checkpoint and moves the read head to the highest + * position that the caller read/seeked to. This will shrink the buffer to + * its original size, discarding any excess data. 
+ */ + auto clear_checkpoint() -> void; + // Methods implementing the ReaderInterface /** * @param pos Returns the position of the read head in the file @@ -145,60 +199,6 @@ class BufferedFileReader : public ReaderInterface { try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str) -> ErrorCode override; - // Methods - /** - * Tries to open a file - * @param path - * @return ErrorCode_Success on success - * @return ErrorCode_FileNotFound if the file was not found - * @return ErrorCode_errno otherwise - */ - [[nodiscard]] auto try_open(std::string const& path) -> ErrorCode; - /** - * Opens a file - * @param path - */ - auto open(std::string const& path) -> void; - /** - * Closes the file if it's open - */ - [[nodiscard]] auto close() -> ErrorCode; - - [[nodiscard]] auto get_path() const -> std::string const& { return m_path; } - - /** - * Peeks the remaining buffered content without advancing the read head. - * - * NOTE: Any subsequent read or seek operations may invalidate the returned - * buffer. - * @param buf Returns a pointer to the remaining content in the buffer - * @param peek_size Returns the size of the remaining content in the buffer - * @return ErrorCode_NotInit if the file is not opened - * @return ErrorCode_errno on on error reading from the underlying file - * @return ErrorCode_EndOfFile if we've already reached EOF - * @return ErrorCode_Success on success - */ - [[nodiscard]] auto peek_buffered_data(char const*& data_ptr, size_t& peek_size) -> ErrorCode; - - /** - * Sets a checkpoint at the current position in the file. If a checkpoint is - * already set, this method will discard any buffered content from before - * the current checkpoint. - * - * NOTE: Setting a checkpoint may result in higher memory usage since the - * BufferedFileReader needs to buffer all the data it reads after the - * checkpoint. 
- * @return The current position in the file - */ - auto set_checkpoint() -> size_t; - - /** - * Clears the current checkpoint and moves the read head to the highest - * position that the caller read/seeked to. This will shrink the buffer to - * its original size, discarding any excess data. - */ - auto clear_checkpoint() -> void; - private: // Methods /** From c270b7990314ef22300b06d552e490ac4613346c Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:16:39 -0400 Subject: [PATCH 100/121] Make peek functions const --- components/core/src/BufferReader.cpp | 2 +- components/core/src/BufferReader.hpp | 2 +- components/core/src/BufferedFileReader.cpp | 34 +++++++++---- components/core/src/BufferedFileReader.hpp | 32 +++++++++++-- components/core/src/LibarchiveFileReader.cpp | 46 +++++++++--------- components/core/src/LibarchiveFileReader.hpp | 21 +++++--- components/core/src/clp/FileCompressor.cpp | 50 +++++++++++++------- 7 files changed, 126 insertions(+), 61 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 54ad250d6..8c0fca813 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -12,7 +12,7 @@ BufferReader::BufferReader(char const* data, size_t data_size, size_t pos) { m_internal_buf_pos = pos; } -auto BufferReader::peek_buffer(char const*& buf, size_t& peek_size) -> void { +auto BufferReader::peek_buffer(char const*& buf, size_t& peek_size) const -> void { peek_size = get_remaining_data_size(); buf = m_internal_buf + m_internal_buf_pos; } diff --git a/components/core/src/BufferReader.hpp b/components/core/src/BufferReader.hpp index 337bcb88a..d52f56171 100644 --- a/components/core/src/BufferReader.hpp +++ b/components/core/src/BufferReader.hpp @@ -33,7 +33,7 @@ class BufferReader : public ReaderInterface { * @param buf Returns a pointer to the remaining content in the buffer * @param peek_size 
Returns the size of the remaining content in the buffer */ - auto peek_buffer(char const*& buf, size_t& peek_size) -> void; + auto peek_buffer(char const*& buf, size_t& peek_size) const -> void; /** * Tries to read up to an occurrence of the given delimiter diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 88385d3d4..310831346 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -112,23 +112,39 @@ auto BufferedFileReader::close() -> ErrorCode { return ErrorCode_Success; } -auto BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_size) -> ErrorCode { +auto BufferedFileReader::try_refill_buffer_if_empty() -> ErrorCode { if (-1 == m_fd) { return ErrorCode_NotInit; } - // Refill the buffer if it is not loaded yet - if (0 == m_buffer_reader->get_buffer_size()) { - auto error_code = refill_reader_buffer(m_base_buffer_size); - if (ErrorCode_Success != error_code) { - buf = nullptr; - peek_size = 0; - return error_code; - } + if (m_buffer_reader->get_buffer_size() > 0) { + return ErrorCode_Success; + } + return refill_reader_buffer(m_base_buffer_size); +} + +void BufferedFileReader::refill_buffer_if_empty() { + auto error_code = try_refill_buffer_if_empty(); + if (ErrorCode_Success != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} + +auto BufferedFileReader::try_peek_buffered_data(char const*& buf, size_t& peek_size) const + -> ErrorCode { + if (-1 == m_fd) { + return ErrorCode_NotInit; } m_buffer_reader->peek_buffer(buf, peek_size); return ErrorCode_Success; } +void BufferedFileReader::peek_buffered_data(const char *& buf, size_t& peek_size) const { + auto error_code = try_peek_buffered_data(buf, peek_size); + if (ErrorCode_Success != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} + auto BufferedFileReader::set_checkpoint() -> size_t { if (m_checkpoint_pos.has_value() && 
m_checkpoint_pos < m_file_pos && m_buffer_reader->get_buffer_size() != m_base_buffer_size) diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 2a8bc00c9..2605423d3 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -109,18 +109,42 @@ class BufferedFileReader : public ReaderInterface { [[nodiscard]] auto get_path() const -> std::string const& { return m_path; } /** - * Peeks the remaining buffered content without advancing the read head. + * Tries to fill the internal buffer if it's empty + * @return ErrorCode_NotInit if the file is not opened + * @return ErrorCode_errno on error reading from the underlying file + * @return ErrorCode_EndOfFile on EOF + * @return ErrorCode_Success on success + */ + [[nodiscard]] auto try_refill_buffer_if_empty() -> ErrorCode; + + /** + * Fills the internal buffer if it's empty + */ + void refill_buffer_if_empty(); + + /** + * Tries to peek the remaining buffered content without advancing the read + * head. * * NOTE: Any subsequent read or seek operations may invalidate the returned * buffer. * @param buf Returns a pointer to the remaining content in the buffer * @param peek_size Returns the size of the remaining content in the buffer * @return ErrorCode_NotInit if the file is not opened - * @return ErrorCode_errno on on error reading from the underlying file - * @return ErrorCode_EndOfFile if we've already reached EOF * @return ErrorCode_Success on success */ - [[nodiscard]] auto peek_buffered_data(char const*& data_ptr, size_t& peek_size) -> ErrorCode; + [[nodiscard]] auto try_peek_buffered_data(char const*& buf, size_t& peek_size) const + -> ErrorCode; + + /** + * Peeks the remaining buffered content without advancing the read head. + * + * NOTE: Any subsequent read or seek operations may invalidate the returned + * buffer. 
+ * @param buf Returns a pointer to the remaining content in the buffer + * @param peek_size Returns the size of the remaining content in the buffer + */ + void peek_buffered_data(char const*& buf, size_t& peek_size) const; /** * Sets a checkpoint at the current position in the file. If a checkpoint is diff --git a/components/core/src/LibarchiveFileReader.cpp b/components/core/src/LibarchiveFileReader.cpp index db488475b..64c037505 100644 --- a/components/core/src/LibarchiveFileReader.cpp +++ b/components/core/src/LibarchiveFileReader.cpp @@ -167,7 +167,7 @@ ErrorCode LibarchiveFileReader::try_read_to_delimiter (char delim, bool keep_del return ErrorCode_Success; } -ErrorCode LibarchiveFileReader::try_peek_buffered_data(char const*& buf, size_t& buf_size) { +ErrorCode LibarchiveFileReader::try_load_data_block() { if (nullptr == m_archive) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -175,33 +175,33 @@ ErrorCode LibarchiveFileReader::try_peek_buffered_data(char const*& buf, size_t& throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - // Read a data block if necessary - if (nullptr == m_data_block) { - auto error_code = read_next_data_block(); - if (ErrorCode_Success != error_code) { - buf = nullptr; - buf_size = 0; - return error_code; - } + if (m_data_block != nullptr) { + return ErrorCode_Success; } + return read_next_data_block(); +} - if (m_data_block_pos_in_file <= m_pos_in_file) { - // Position in the file is within the data block, so we can directly - // return a const pointer to the current data block +void LibarchiveFileReader::peek_buffered_data(char const*& buf, size_t& buf_size) const { + if (nullptr == m_archive) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + if (nullptr == m_archive_entry) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + if (m_pos_in_file < m_data_block_pos_in_file) { + // Position in the file is before the current data block, so we 
return + // nulls corresponding to the sparse bytes before the data block + // NOTE: We don't return ALL sparse bytes before the data block since + // that might require allocating more bytes, violating the const-ness of + // this method. Since peek is a best-effort method, this should be + // sufficient for most callers. + buf = m_nulls_for_peek.data(); + buf_size = std::min(m_nulls_for_peek.size(), m_data_block_pos_in_file - m_pos_in_file); + } else { buf_size = m_data_block_length - m_pos_in_data_block; buf = static_cast(m_data_block); - return ErrorCode_Success; } - - auto num_sparse_bytes = m_data_block_pos_in_file - m_pos_in_file; - buf_size = num_sparse_bytes + m_data_block_length; - m_data_for_peek.resize(buf_size, '\0'); - buf = static_cast(m_data_for_peek.data()); - - size_t const remaining_bytes_to_peek = buf_size - num_sparse_bytes; - memcpy(&m_data_for_peek[num_sparse_bytes], m_data_block, remaining_bytes_to_peek); - - return ErrorCode_Success; } void LibarchiveFileReader::open (struct archive* archive, struct archive_entry* archive_entry) { diff --git a/components/core/src/LibarchiveFileReader.hpp b/components/core/src/LibarchiveFileReader.hpp index 063e83f72..dd2ea1731 100644 --- a/components/core/src/LibarchiveFileReader.hpp +++ b/components/core/src/LibarchiveFileReader.hpp @@ -2,8 +2,8 @@ #define LIBARCHIVEFILEREADER_HPP // C++ standard libraries +#include #include -#include // libarchive #include @@ -71,13 +71,22 @@ class LibarchiveFileReader : public ReaderInterface { ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; /** - * @param buf Returns a pointer to any buffered data - * @param buf_size Returns the number of bytes in the buffer + * Tries to the load a data block from the file if none is loaded * @return ErrorCode_EndOfFile on EOF * @return ErrorCode_Failure on failure * @return ErrorCode_Success on success */ - [[nodiscard]] ErrorCode try_peek_buffered_data(char const*& buf, size_t& 
buf_size); + [[nodiscard]] ErrorCode try_load_data_block(); + + /** + * Peeks the remaining buffered content without advancing the read head. + * + * NOTE: Any subsequent read or seek operations may invalidate the returned + * buffer. + * @param buf Returns a pointer to any buffered data + * @param buf_size Returns the number of bytes in the buffer + */ + void peek_buffered_data(char const*& buf, size_t& buf_size) const; // Methods /** @@ -113,8 +122,8 @@ class LibarchiveFileReader : public ReaderInterface { size_t m_pos_in_file; - // vector to hold peeked data - std::vector m_data_for_peek; + // Nulls for peek + std::array m_nulls_for_peek{0}; }; #endif // LIBARCHIVEFILEREADER_HPP diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 75618a417..c11c9566f 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -90,13 +90,25 @@ namespace clp { m_file_reader.open(file_to_compress.get_path()); // Check that file is UTF-8 encoded - if (auto error_code = m_file_reader.peek_buffered_data(m_utf8_validation_buf, - m_utf8_validation_buf_length); - ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) { - SPDLOG_ERROR("Failed to peek data from {}, errno={}", - file_to_compress.get_path().c_str(), errno); - return error_code; + if (auto error_code = m_file_reader.try_refill_buffer_if_empty(); + ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) + { + if (ErrorCode_errno == error_code) { + SPDLOG_ERROR( + "Failed to read {} into buffer, errno={}", + file_to_compress.get_path(), + errno + ); + } else { + SPDLOG_ERROR( + "Failed to read {} into buffer, error={}", + file_to_compress.get_path(), + error_code + ); + } + return false; } + m_file_reader.peek_buffered_data(m_utf8_validation_buf, m_utf8_validation_buf_length); bool succeeded = true; if (is_utf8_sequence(m_utf8_validation_buf_length, m_utf8_validation_buf)) { if (use_heuristic) { @@ 
-248,20 +260,24 @@ namespace clp { } m_libarchive_reader.open_file_reader(m_libarchive_file_reader); - error_code = m_libarchive_file_reader.try_peek_buffered_data( + + // Check that file is UTF-8 encoded + if (auto error_code = m_libarchive_file_reader.try_load_data_block(); + ErrorCode_Success != error_code && ErrorCode_EndOfFile != error_code) + { + SPDLOG_ERROR( + "Failed to load data block from {}, error={}", + file_to_compress.get_path(), + error_code + ); + m_libarchive_file_reader.close(); + succeeded = false; + continue; + } + m_libarchive_file_reader.peek_buffered_data( m_utf8_validation_buf, m_utf8_validation_buf_length ); - // Check that file is UTF-8 encoded - if (ErrorCode_Success != error_code) { - if (ErrorCode_EndOfFile != error_code) { - SPDLOG_ERROR("Failed to peek data from {}, errno={}", - file_to_compress.get_path().c_str(), errno); - m_libarchive_file_reader.close(); - succeeded = false; - continue; - } - } if (is_utf8_sequence(m_utf8_validation_buf_length, m_utf8_validation_buf)) { auto boost_path_for_compression = parent_boost_path / m_libarchive_reader.get_path(); if (use_heuristic) { From 07fba9756077c3161b7a4ad9001fba4706dbf9bb Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:29:00 -0400 Subject: [PATCH 101/121] Undo my mistake for close --- components/core/src/BufferedFileReader.cpp | 28 +++++++++---------- components/core/src/BufferedFileReader.hpp | 4 +-- components/core/src/clp/FileCompressor.cpp | 2 +- .../core/tests/test-BufferedFileReader.cpp | 4 +-- 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index 310831346..e9718fd3e 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -59,12 +59,12 @@ BufferedFileReader::BufferedFileReader(size_t base_buffer_size) { } BufferedFileReader::~BufferedFileReader() { - 
std::ignore = close(); + close(); } auto BufferedFileReader::try_open(string const& path) -> ErrorCode { // Cleanup in case caller forgot to call close before calling this function - std::ignore = close(); + close(); m_fd = ::open(path.c_str(), O_RDONLY); if (-1 == m_fd) { @@ -96,20 +96,20 @@ void BufferedFileReader::open(string const& path) { } } -auto BufferedFileReader::close() -> ErrorCode { - if (-1 != m_fd) { - if (m_checkpoint_pos.has_value()) { - m_buffer.resize(m_base_buffer_size); - m_checkpoint_pos.reset(); - } +auto BufferedFileReader::close() -> void { + if (-1 == m_fd) { + return; + } - auto close_result = ::close(m_fd); - m_fd = -1; - if (0 != close_result) { - return ErrorCode_errno; - } + if (m_checkpoint_pos.has_value()) { + m_buffer.resize(m_base_buffer_size); + m_checkpoint_pos.reset(); } - return ErrorCode_Success; + + // NOTE: We don't check errors for close since, in the read case, it seems + // the only reason it could fail is if it was interrupted by a signal + ::close(m_fd); + m_fd = -1; } auto BufferedFileReader::try_refill_buffer_if_empty() -> ErrorCode { diff --git a/components/core/src/BufferedFileReader.hpp b/components/core/src/BufferedFileReader.hpp index 2605423d3..7cdd659a6 100644 --- a/components/core/src/BufferedFileReader.hpp +++ b/components/core/src/BufferedFileReader.hpp @@ -101,10 +101,8 @@ class BufferedFileReader : public ReaderInterface { /** * Closes the file if it's open - * @return ErrorCode_errno on error closing the underlying file - * @return ErrorCode_Success on success */ - [[nodiscard]] auto close() -> ErrorCode; + auto close() -> void; [[nodiscard]] auto get_path() const -> std::string const& { return m_path; } diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index c11c9566f..9c546d6da 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -128,7 +128,7 @@ namespace clp { } } - std::ignore = 
m_file_reader.close(); + m_file_reader.close(); Profiler::stop_continuous_measurement(); LOG_CONTINUOUS_MEASUREMENT(Profiler::ContinuousMeasurementIndex::ParseLogFile) diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 707a9a1fc..827ae970b 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -56,7 +56,7 @@ TEST_CASE("Test reading data", "[BufferedFileReader]") { size_t read_size4 = 1; REQUIRE(ErrorCode_EndOfFile == file_reader.try_read(read_buffer + buffer_offset, read_size4, num_bytes_read)); - std::ignore = file_reader.close(); + file_reader.close(); } SECTION("Simple Seek without checkpoint") { @@ -238,5 +238,5 @@ TEST_CASE("Test delimiter", "[BufferedFileReader]") { } ref_file_reader.close(); - std::ignore = file_reader.close(); + file_reader.close(); } From 1b5e51a6d442f7944e68751be60ed1df7ff4fa60 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:37:46 -0400 Subject: [PATCH 102/121] FileCompressor: Clean-up --- components/core/src/clp/FileCompressor.cpp | 17 +++++++++-------- components/core/src/clp/FileCompressor.hpp | 2 -- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 9c546d6da..b64e1f951 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -108,9 +108,11 @@ namespace clp { } return false; } - m_file_reader.peek_buffered_data(m_utf8_validation_buf, m_utf8_validation_buf_length); + char const* utf8_validation_buf{nullptr}; + size_t utf8_validation_buf_len{0}; + m_file_reader.peek_buffered_data(utf8_validation_buf, utf8_validation_buf_len); bool succeeded = true; - if (is_utf8_sequence(m_utf8_validation_buf_length, m_utf8_validation_buf)) { + if (is_utf8_sequence(utf8_validation_buf_len, 
utf8_validation_buf)) { if (use_heuristic) { parse_and_encode_with_heuristic(target_data_size_of_dicts, archive_user_config, target_encoded_file_size, file_to_compress.get_path_for_compression(), @@ -148,9 +150,6 @@ namespace clp { archive_writer.m_target_encoded_file_size = target_encoded_file_size; // Open compressed file archive_writer.create_and_open_file(path_for_compression, group_id, m_uuid_generator(), 0); - // TODO: decide what to actually do about this - // for now reset reader rather than try reading m_utf8_validation_buf as it would be - // very awkward to combine sources to/in the parser m_log_parser->set_archive_writer_ptr(&archive_writer); m_log_parser->get_archive_writer_ptr()->old_ts_pattern.clear(); try { @@ -274,11 +273,13 @@ namespace clp { succeeded = false; continue; } + char const* utf8_validation_buf{nullptr}; + size_t utf8_validation_buf_len{0}; m_libarchive_file_reader.peek_buffered_data( - m_utf8_validation_buf, - m_utf8_validation_buf_length + utf8_validation_buf, + utf8_validation_buf_len ); - if (is_utf8_sequence(m_utf8_validation_buf_length, m_utf8_validation_buf)) { + if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) { auto boost_path_for_compression = parent_boost_path / m_libarchive_reader.get_path(); if (use_heuristic) { parse_and_encode_with_heuristic(target_data_size_of_dicts, archive_user_config, target_encoded_file_size, diff --git a/components/core/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp index e2fe3d92d..2db87c9d2 100644 --- a/components/core/src/clp/FileCompressor.hpp +++ b/components/core/src/clp/FileCompressor.hpp @@ -80,8 +80,6 @@ namespace clp { BufferedFileReader m_file_reader; LibarchiveReader m_libarchive_reader; LibarchiveFileReader m_libarchive_file_reader; - const char* m_utf8_validation_buf; - size_t m_utf8_validation_buf_length; MessageParser m_message_parser; ParsedMessage m_parsed_message; std::unique_ptr m_log_parser; From 
9cdfe578e5123d0d6cb0e908d001b6f7290724b1 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:40:17 -0400 Subject: [PATCH 103/121] LibarchiveFileReader: Move new methods into the right section. --- components/core/src/LibarchiveFileReader.cpp | 58 ++++++++++---------- components/core/src/LibarchiveFileReader.hpp | 24 ++++---- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/components/core/src/LibarchiveFileReader.cpp b/components/core/src/LibarchiveFileReader.cpp index 64c037505..4dc963d9a 100644 --- a/components/core/src/LibarchiveFileReader.cpp +++ b/components/core/src/LibarchiveFileReader.cpp @@ -167,6 +167,35 @@ ErrorCode LibarchiveFileReader::try_read_to_delimiter (char delim, bool keep_del return ErrorCode_Success; } +void LibarchiveFileReader::open (struct archive* archive, struct archive_entry* archive_entry) { + if (nullptr == archive) { + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + if (nullptr == archive_entry) { + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + if (nullptr != m_archive) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + m_archive = archive; + m_archive_entry = archive_entry; +} + +void LibarchiveFileReader::close () { + if (nullptr == m_archive) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + m_archive = nullptr; + m_archive_entry = nullptr; + + m_data_block = nullptr; + m_reached_eof = false; + + m_pos_in_file = 0; +} + ErrorCode LibarchiveFileReader::try_load_data_block() { if (nullptr == m_archive) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); @@ -204,35 +233,6 @@ void LibarchiveFileReader::peek_buffered_data(char const*& buf, size_t& buf_size } } -void LibarchiveFileReader::open (struct archive* archive, struct archive_entry* archive_entry) { - if (nullptr == archive) { - throw OperationFailed(ErrorCode_BadParam, 
__FILENAME__, __LINE__); - } - if (nullptr == archive_entry) { - throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); - } - if (nullptr != m_archive) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); - } - - m_archive = archive; - m_archive_entry = archive_entry; -} - -void LibarchiveFileReader::close () { - if (nullptr == m_archive) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); - } - - m_archive = nullptr; - m_archive_entry = nullptr; - - m_data_block = nullptr; - m_reached_eof = false; - - m_pos_in_file = 0; -} - ErrorCode LibarchiveFileReader::read_next_data_block () { auto return_value = archive_read_data_block(m_archive, &m_data_block, &m_data_block_length, &m_data_block_pos_in_file); if (ARCHIVE_OK != return_value) { diff --git a/components/core/src/LibarchiveFileReader.hpp b/components/core/src/LibarchiveFileReader.hpp index dd2ea1731..a7a53d051 100644 --- a/components/core/src/LibarchiveFileReader.hpp +++ b/components/core/src/LibarchiveFileReader.hpp @@ -70,6 +70,18 @@ class LibarchiveFileReader : public ReaderInterface { */ ErrorCode try_read_to_delimiter (char delim, bool keep_delimiter, bool append, std::string& str) override; + // Methods + /** + * Opens the file reader + * @param archive + * @param archive_entry + */ + void open (struct archive* archive, struct archive_entry* archive_entry); + /** + * Closes the file reader + */ + void close (); + /** * Tries to the load a data block from the file if none is loaded * @return ErrorCode_EndOfFile on EOF @@ -88,18 +100,6 @@ class LibarchiveFileReader : public ReaderInterface { */ void peek_buffered_data(char const*& buf, size_t& buf_size) const; - // Methods - /** - * Opens the file reader - * @param archive - * @param archive_entry - */ - void open (struct archive* archive, struct archive_entry* archive_entry); - /** - * Closes the file reader - */ - void close (); - private: // Methods /** From fd74ed9e7074f7e6f55b392ce45d3375bf0d6c96 Mon Sep 17 
00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:42:25 -0400 Subject: [PATCH 104/121] LibarchiveReader: Space fix. --- components/core/src/LibarchiveReader.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/LibarchiveReader.hpp b/components/core/src/LibarchiveReader.hpp index 734bc67a9..4712bbfb0 100644 --- a/components/core/src/LibarchiveReader.hpp +++ b/components/core/src/LibarchiveReader.hpp @@ -48,7 +48,7 @@ class LibarchiveReader { * @return ErrorCode_Success on success * @return ErrorCode_Failure on failure */ - ErrorCode try_open (ReaderInterface&reader, const std::string& path_if_compressed_file); + ErrorCode try_open (ReaderInterface& reader, const std::string& path_if_compressed_file); /** * Closes the reader */ From 84028bb373d800a135854e57f73bc54671ea7127 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 11:25:53 -0400 Subject: [PATCH 105/121] BufferedFileReader: Basic refactor of unit tests. 
--- .../core/tests/test-BufferedFileReader.cpp | 380 ++++++++++-------- 1 file changed, 214 insertions(+), 166 deletions(-) diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 827ae970b..8ae0ade68 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -2,221 +2,267 @@ #include #include "../src/BufferedFileReader.hpp" +#include "../src/FileReader.hpp" #include "../src/FileWriter.hpp" +static constexpr size_t cNumAlphabets = 'z' - 'a'; + TEST_CASE("Test reading data", "[BufferedFileReader]") { // Initialize data for testing - size_t test_data_size = 4L * 1024 * 1024 + 1; // 4MB + 1 - char* test_data = new char[test_data_size]; - char* read_buffer = new char[test_data_size]; - for (size_t i = 0; i < test_data_size; ++i) { - test_data[i] = (char)('a' + (i % 26)); + size_t const test_data_size = 4L * 1024 * 1024 + 1; // 4MB + 1 + auto test_data_uniq_ptr = std::make_unique>(); + auto& test_data = *test_data_uniq_ptr; + for (size_t i = 0; i < test_data.size(); ++i) { + test_data[i] = static_cast('a' + (i % (cNumAlphabets))); } - std::string test_file_path{"BufferedFileReader.test"}; - // write to test file + std::string const test_file_path{"BufferedFileReader.test"}; + // Write to test file FileWriter file_writer; file_writer.open(test_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); - file_writer.write(test_data, test_data_size); + file_writer.write(test_data.cbegin(), test_data_size); file_writer.close(); - SECTION("General read testing") { - BufferedFileReader file_reader; - file_reader.open(test_file_path); - size_t num_bytes_read{0}; - size_t buffer_offset{0}; + auto read_buf_uniq_ptr = std::make_unique>(); + auto& read_buf = *read_buf_uniq_ptr; + size_t const base_buffer_size = BufferedFileReader::cMinBufferSize << 4; + BufferedFileReader reader{base_buffer_size}; + reader.open(test_file_path); + + size_t num_bytes_read{0}; + 
size_t buf_pos{0}; + size_t num_bytes_to_read{0}; - // first, read a small chunk of data; - size_t read_size1{1023}; + SECTION("General read testing") { + // Read a small chunk of data; + num_bytes_to_read = base_buffer_size >> 6; REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer + buffer_offset, read_size1, num_bytes_read)); - REQUIRE(read_size1 == num_bytes_read); - REQUIRE(0 == memcmp(read_buffer, test_data, read_size1)); - buffer_offset += num_bytes_read; - - // second, read a large chunk of data, so - // BufferedFileReader will refill the internal buffer - size_t read_size2{65'538}; + == reader.try_read(read_buf.begin() + buf_pos, num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data(), buf_pos)); + + // Read a large chunk of data, so BufferedFileReader will refill the + // internal buffer + num_bytes_to_read = base_buffer_size + 2; REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer + buffer_offset, read_size2, num_bytes_read)); - REQUIRE(read_size2 == num_bytes_read); - REQUIRE(0 == memcmp(read_buffer, test_data, read_size1 + read_size2)); - buffer_offset += num_bytes_read; - - // third, read remaining data - size_t read_size3 = test_data_size - read_size2 - read_size1; + == reader.try_read(read_buf.begin() + buf_pos, num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data(), buf_pos)); + + // Read remaining data + num_bytes_to_read = test_data_size - buf_pos; REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer + buffer_offset, read_size3, num_bytes_read)); - REQUIRE(read_size3 == num_bytes_read); - buffer_offset += num_bytes_read; - - REQUIRE(0 == memcmp(read_buffer, test_data, test_data_size)); - - // lastly, make sure 
the file reaches eof - size_t read_size4 = 1; + == reader.try_read(read_buf.begin() + buf_pos, num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data(), buf_pos)); + + // Ensure the file reaches EOF + num_bytes_to_read = 1; REQUIRE(ErrorCode_EndOfFile - == file_reader.try_read(read_buffer + buffer_offset, read_size4, num_bytes_read)); - file_reader.close(); + == reader.try_read(read_buf.begin() + buf_pos, num_bytes_to_read, num_bytes_read)); } - SECTION("Simple Seek without checkpoint") { - BufferedFileReader file_reader; - file_reader.open(test_file_path); + SECTION("Simple Seek without a checkpoint") { + num_bytes_to_read = base_buffer_size + 4; - // seek to some random position and do a read - size_t seek_pos1{245}; - size_t num_bytes_to_read{65'540}; - size_t num_byte_read; - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos1)); - REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); - REQUIRE(num_bytes_to_read == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos1, num_bytes_to_read)); + // Seek to some random position + size_t seek_pos{245}; + REQUIRE(ErrorCode_Success == reader.try_seek_from_begin(seek_pos)); + buf_pos = seek_pos; + REQUIRE(reader.get_pos() == buf_pos); - // seek front to random position and do a read - size_t seek_pos2{345'212}; - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos2)); + // Do a read REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); - REQUIRE(num_bytes_to_read == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos2, num_bytes_to_read)); - - // the seek should fail on a backward seek when checkpoint is not enabled - REQUIRE(ErrorCode_Unsupported == file_reader.try_seek_from_begin(seek_pos2)); + == 
reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + seek_pos, num_bytes_to_read)); + + // Seek forwards to another random position + seek_pos = 345'212; + REQUIRE(ErrorCode_Success == reader.try_seek_from_begin(seek_pos)); + buf_pos = seek_pos; + REQUIRE(reader.get_pos() == buf_pos); + + // Do a read + REQUIRE(ErrorCode_Success + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + seek_pos, num_bytes_to_read)); + + // Ensure we can't seek backwards when there's no checkpoint + REQUIRE(ErrorCode_Unsupported == reader.try_seek_from_begin(seek_pos)); } - SECTION("seek with checkpoint") { - BufferedFileReader file_reader; - file_reader.open(test_file_path); + SECTION("Seek with a checkpoint") { + // Read some data to advance the read head + num_bytes_to_read = base_buffer_size + 4; + REQUIRE(ErrorCode_Success + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + buf_pos, num_bytes_to_read)); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); - size_t num_byte_read; + auto checkpoint_pos = reader.set_checkpoint(); - // first, read some data to advance the file_pos - size_t num_bytes_to_read_1 = 65'540; + // Read some more data + num_bytes_to_read = 345'212; REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, num_bytes_to_read_1, num_byte_read)); - REQUIRE(file_reader.get_pos() == num_bytes_to_read_1); + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + REQUIRE(0 
== memcmp(read_buf.data(), test_data.data() + buf_pos, num_bytes_to_read)); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); - // set a checkpoint - size_t checkpoint_pos = file_reader.set_checkpoint(); + size_t highest_file_pos = reader.get_pos(); - // keep reading some data - size_t num_bytes_to_read_2 = 345'212; - REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, num_bytes_to_read_2, num_byte_read)); - REQUIRE(file_reader.get_pos() == num_bytes_to_read_1 + num_bytes_to_read_2); - size_t latest_file_pos = file_reader.get_pos(); + // Seek backwards to somewhere between the checkpoint and the read head + size_t const seek_pos_1 = checkpoint_pos + 500; + REQUIRE(ErrorCode_Success == reader.try_seek_from_begin(seek_pos_1)); + buf_pos = seek_pos_1; + REQUIRE(reader.get_pos() == buf_pos); - // now seek back to somewhere between - size_t seek_pos_1 = checkpoint_pos + 500; - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos_1)); + // Read some data REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, num_bytes_to_read_2, num_byte_read)); - REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos_1, num_bytes_to_read_2)); + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + buf_pos, num_bytes_to_read)); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + + highest_file_pos = std::max(highest_file_pos, reader.get_pos()); - // update the latest_file_pos if necessary - latest_file_pos = std::max(latest_file_pos, file_reader.get_pos()); + // Ensure we can't seek to a position that's before the checkpoint + REQUIRE(ErrorCode_Unsupported == reader.try_seek_from_begin(checkpoint_pos - 1)); + REQUIRE(reader.get_pos() == buf_pos); - // now try to seek back to a pos that's before the checkpoint - REQUIRE(ErrorCode_Unsupported == file_reader.try_seek_from_begin(checkpoint_pos 
- 1)); + // Seek back to the highest file pos + REQUIRE(ErrorCode_Success == reader.try_seek_from_begin(highest_file_pos)); + buf_pos = highest_file_pos; + REQUIRE(reader.get_pos() == buf_pos); - // now go back to latest data - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(latest_file_pos)); - // make sure data read after latest_file_pos - size_t num_bytes_to_read_3 = 4096; + // Do a read + num_bytes_to_read = 4096; REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, num_bytes_to_read_3, num_byte_read)); - REQUIRE(num_bytes_to_read_3 == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, num_bytes_to_read_3)); - // update the latest_file_pos - latest_file_pos = file_reader.get_pos(); - - // seek back to somewhere between the checkpoint and latest data, and set a new checkpoint - file_reader.seek_from_begin((latest_file_pos + checkpoint_pos) / 2); - file_reader.set_checkpoint(); - // the previous seek_pos should be unavailable - REQUIRE(ErrorCode_Unsupported == file_reader.try_seek_from_begin(seek_pos_1)); - - // make sure data read after checkpoint-set are still correct - size_t num_bytes_to_read_4 = 4096; + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + buf_pos, num_bytes_to_read)); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + + highest_file_pos = reader.get_pos(); + + // Seek to somewhere between the checkpoint and latest data + size_t const seek_pos_2 = (highest_file_pos + checkpoint_pos) / 2; + reader.seek_from_begin(seek_pos_2); + buf_pos = seek_pos_2; + REQUIRE(reader.get_pos() == buf_pos); + + // Set a new checkpoint + reader.set_checkpoint(); + + // Ensure we can't seek to seek_pos_1 + REQUIRE(ErrorCode_Unsupported == reader.try_seek_from_begin(seek_pos_1)); + + // Do a read + num_bytes_to_read = 4096; REQUIRE(ErrorCode_Success - == 
file_reader.try_read(read_buffer, num_bytes_to_read_4, num_byte_read)); - REQUIRE(num_bytes_to_read_4 == num_byte_read); - REQUIRE(0 - == memcmp( - read_buffer, - test_data + (latest_file_pos + checkpoint_pos) / 2, - num_bytes_to_read_4 - )); - - file_reader.clear_checkpoint(); - size_t default_buffer_size = 65'536; - // make sure data read after checkpoint-reset are still correct; + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + buf_pos, num_bytes_to_read)); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + + reader.clear_checkpoint(); + buf_pos = highest_file_pos; + REQUIRE(reader.get_pos() == buf_pos); + + // Do a read + num_bytes_to_read = base_buffer_size; REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, default_buffer_size, num_byte_read)); - REQUIRE(default_buffer_size == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + latest_file_pos, default_buffer_size)); + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + buf_pos, num_bytes_to_read)); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); } - SECTION("seek with delayed read") { - BufferedFileReader file_reader; - file_reader.open(test_file_path); - - // first, advance to some random file_pos - size_t begin_read_pos = 45'313; - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(begin_read_pos)); - - // set a checkpoint - size_t checkpoint_pos = file_reader.set_checkpoint(); + SECTION("Seek with delayed read") { + // Advance to some random position + size_t seek_pos = 45'313; + REQUIRE(ErrorCode_Success == reader.try_seek_from_begin(seek_pos)); + buf_pos = seek_pos; + REQUIRE(reader.get_pos() == buf_pos); - // keep reading some data - size_t num_bytes_to_read; - size_t 
num_byte_read; + auto checkpoint_pos = reader.set_checkpoint(); + // Do a read num_bytes_to_read = 345'212; REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); - REQUIRE(file_reader.get_pos() == checkpoint_pos + num_bytes_to_read); - REQUIRE(0 == memcmp(read_buffer, test_data + begin_read_pos, num_bytes_to_read)); - - // now seek back to somewhere between - size_t seek_pos = file_reader.get_pos() / 2; - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + buf_pos, num_bytes_to_read)); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + + // Seek to somewhere between the checkpoint and the read head + seek_pos = reader.get_pos() / 2; + REQUIRE(ErrorCode_Success == reader.try_seek_from_begin(seek_pos)); + buf_pos = seek_pos; + REQUIRE(reader.get_pos() == buf_pos); + + // Do a read REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); - REQUIRE(num_bytes_to_read == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); - - // test a seek that reaches the end of the file - num_bytes_to_read = 500; - seek_pos = test_data_size - num_bytes_to_read; - REQUIRE(ErrorCode_Success == file_reader.try_seek_from_begin(seek_pos)); + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + buf_pos, num_bytes_to_read)); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); + + // Seek close to the end of the file + seek_pos = test_data_size - 500; + REQUIRE(ErrorCode_Success == reader.try_seek_from_begin(seek_pos)); + buf_pos = seek_pos; + REQUIRE(reader.get_pos() == buf_pos); + + // Do a read + 
num_bytes_to_read = test_data_size - seek_pos; REQUIRE(ErrorCode_Success - == file_reader.try_read(read_buffer, num_bytes_to_read, num_byte_read)); - REQUIRE(num_bytes_to_read == num_byte_read); - REQUIRE(0 == memcmp(read_buffer, test_data + seek_pos, num_bytes_to_read)); + == reader.try_read(read_buf.data(), num_bytes_to_read, num_bytes_read)); + REQUIRE(num_bytes_to_read == num_bytes_read); + REQUIRE(0 == memcmp(read_buf.data(), test_data.data() + buf_pos, num_bytes_to_read)); + buf_pos += num_bytes_read; + REQUIRE(reader.get_pos() == buf_pos); } - delete[] test_data; - delete[] read_buffer; + reader.close(); + boost::filesystem::remove(test_file_path); } -#include "../src/FileReader.hpp" - TEST_CASE("Test delimiter", "[BufferedFileReader]") { // Initialize data for testing - size_t test_data_size = 1L * 1024 * 1024; // 1MB - char* test_data = new char[test_data_size]; - std::srand(0); - for (size_t i = 0; i < test_data_size; ++i) { - test_data[i] = (char)('a' + (std::rand() % 26)); + size_t const test_data_size = 1L * 1024 * 1024 + 1; // 1MB + auto test_data_uniq_ptr = std::make_unique>(); + auto& test_data = *test_data_uniq_ptr; + for (size_t i = 0; i < test_data.size(); ++i) { + test_data[i] = static_cast('a' + (std::rand() % (cNumAlphabets))); } - std::string test_file_path{"BufferedFileReader.delimiter.test"}; - // write to test file + // Write to test file + std::string const test_file_path{"BufferedFileReader.delimiter.test"}; FileWriter file_writer; file_writer.open(test_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); - file_writer.write(test_data, test_data_size); + file_writer.write(test_data.data(), test_data_size); file_writer.close(); BufferedFileReader file_reader; @@ -227,16 +273,18 @@ TEST_CASE("Test delimiter", "[BufferedFileReader]") { ref_file_reader.open(test_file_path); std::string ref_string; - ErrorCode error_code = ErrorCode_Success; - char delimiter = (char)('a' + (std::rand() % 26)); + // Validate that a FileReader and a 
BufferedFileReader return the same + // strings (split by delimiters) + ErrorCode error_code{ErrorCode_Success}; + auto delimiter = (char)('a' + (std::rand() % (cNumAlphabets))); while (ErrorCode_EndOfFile != error_code) { error_code = ref_file_reader.try_read_to_delimiter(delimiter, true, false, ref_string); - auto error_code2 - = file_reader.try_read_to_delimiter(delimiter, true, false, test_string); + auto error_code2 = file_reader.try_read_to_delimiter(delimiter, true, false, test_string); REQUIRE(error_code2 == error_code); REQUIRE(test_string == ref_string); } ref_file_reader.close(); file_reader.close(); + boost::filesystem::remove(test_file_path); } From ebf9d4b7b9b72aecd0c6b8cb422298478d9fbcc8 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 11:26:50 -0400 Subject: [PATCH 106/121] East-const fixes. --- components/core/src/BufferReader.cpp | 4 ++-- components/core/src/BufferedFileReader.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/src/BufferReader.cpp b/components/core/src/BufferReader.cpp index 8c0fca813..3290c8ea5 100644 --- a/components/core/src/BufferReader.cpp +++ b/components/core/src/BufferReader.cpp @@ -65,8 +65,8 @@ auto BufferReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_byt } num_bytes_read = std::min(remaining_data_size, num_bytes_to_read); - const auto* copy_begin = m_internal_buf + m_internal_buf_pos; - const auto* copy_end = copy_begin + num_bytes_read; + auto const* copy_begin = m_internal_buf + m_internal_buf_pos; + auto const* copy_end = copy_begin + num_bytes_read; std::copy(copy_begin, copy_end, buf); m_internal_buf_pos += num_bytes_read; return ErrorCode_Success; diff --git a/components/core/src/BufferedFileReader.cpp b/components/core/src/BufferedFileReader.cpp index e9718fd3e..8daaebb17 100644 --- a/components/core/src/BufferedFileReader.cpp +++ b/components/core/src/BufferedFileReader.cpp @@ -138,7 
+138,7 @@ auto BufferedFileReader::try_peek_buffered_data(char const*& buf, size_t& peek_s return ErrorCode_Success; } -void BufferedFileReader::peek_buffered_data(const char *& buf, size_t& peek_size) const { +void BufferedFileReader::peek_buffered_data(char const*& buf, size_t& peek_size) const { auto error_code = try_peek_buffered_data(buf, peek_size); if (ErrorCode_Success != error_code) { throw OperationFailed(error_code, __FILENAME__, __LINE__); From 492bd966d9477f44dbe5e3f559ffda3d2ac31e96 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 11:54:10 -0400 Subject: [PATCH 107/121] Fix macOS build issue. --- components/core/src/LibarchiveFileReader.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/components/core/src/LibarchiveFileReader.cpp b/components/core/src/LibarchiveFileReader.cpp index 4dc963d9a..e5a7b42cc 100644 --- a/components/core/src/LibarchiveFileReader.cpp +++ b/components/core/src/LibarchiveFileReader.cpp @@ -226,7 +226,10 @@ void LibarchiveFileReader::peek_buffered_data(char const*& buf, size_t& buf_size // this method. Since peek is a best-effort method, this should be // sufficient for most callers. buf = m_nulls_for_peek.data(); - buf_size = std::min(m_nulls_for_peek.size(), m_data_block_pos_in_file - m_pos_in_file); + buf_size = std::min( + m_nulls_for_peek.size(), + static_cast(m_data_block_pos_in_file - m_pos_in_file) + ); } else { buf_size = m_data_block_length - m_pos_in_data_block; buf = static_cast(m_data_block); From e1787de1be32751284dfd0b4b7dbcb5a65f71448 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:44:56 -0400 Subject: [PATCH 108/121] Add missing include. 
--- components/core/tests/test-BufferedFileReader.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/components/core/tests/test-BufferedFileReader.cpp b/components/core/tests/test-BufferedFileReader.cpp index 8ae0ade68..5647b4939 100644 --- a/components/core/tests/test-BufferedFileReader.cpp +++ b/components/core/tests/test-BufferedFileReader.cpp @@ -1,3 +1,5 @@ +#include + #include #include From 5e8cb405e88078f72e7bee6f60a00793897e5213 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 14 Aug 2023 23:49:29 -0400 Subject: [PATCH 109/121] clangformat --- components/core/src/IrMessageParser.cpp | 7 +-- components/core/src/IrMessageParser.hpp | 4 +- components/core/src/ParsedIrMessage.cpp | 15 +++--- components/core/src/ParsedIrMessage.hpp | 8 ++- components/core/src/clp/FileCompressor.cpp | 53 ++++++++++++------- .../src/ffi/ir_stream/decoding_methods.cpp | 23 ++++---- .../src/ffi/ir_stream/decoding_methods.hpp | 25 ++++----- 7 files changed, 70 insertions(+), 65 deletions(-) diff --git a/components/core/src/IrMessageParser.cpp b/components/core/src/IrMessageParser.cpp index e8f0fa2c9..ee9a83a5f 100644 --- a/components/core/src/IrMessageParser.cpp +++ b/components/core/src/IrMessageParser.cpp @@ -1,17 +1,12 @@ #include "IrMessageParser.hpp" -// Project headers +#include "../../../submodules/json/single_include/nlohmann/json.hpp" #include "BufferReader.hpp" #include "EncodedVariableInterpreter.hpp" #include "ffi/encoding_methods.hpp" #include "ffi/ir_stream/protocol_constants.hpp" - -// spdlog #include "spdlog/spdlog.h" -// json -#include "../../../submodules/json/single_include/nlohmann/json.hpp" - using ffi::cVariablePlaceholderEscapeCharacter; using ffi::eight_byte_encoded_variable_t; using ffi::four_byte_encoded_variable_t; diff --git a/components/core/src/IrMessageParser.hpp b/components/core/src/IrMessageParser.hpp index 4809ea8ca..076372131 100644 --- a/components/core/src/IrMessageParser.hpp +++ 
b/components/core/src/IrMessageParser.hpp @@ -41,7 +41,7 @@ class IrMessageParser { auto get_msg_logtype_entry() -> LogTypeDictionaryEntry& { return m_msg.get_logtype_entry(); } [[nodiscard]] auto parse_next_encoded_message() -> bool; - static bool is_ir_encoded(size_t sequence_length, char const* data); + static auto is_ir_encoded(size_t sequence_length, char const* data) -> bool; private: [[nodiscard]] auto parse_next_four_bytes_message() -> bool; @@ -49,7 +49,7 @@ class IrMessageParser { [[nodiscard]] auto decode_json_preamble(std::string& json_metadata) -> bool; // member variables - bool m_is_four_bytes_encoded; + bool m_is_four_bytes_encoded{false}; epochtime_t m_reference_timestamp; TimestampPattern m_ts_pattern; ParsedIrMessage m_msg; diff --git a/components/core/src/ParsedIrMessage.cpp b/components/core/src/ParsedIrMessage.cpp index a1ead4de3..0eb92667e 100644 --- a/components/core/src/ParsedIrMessage.cpp +++ b/components/core/src/ParsedIrMessage.cpp @@ -1,13 +1,7 @@ #include "ParsedIrMessage.hpp" -// C standard libraries - -// C++ standard libraries - -// Project headers #include "LogTypeDictionaryEntry.hpp" -// spdlog #include "spdlog/spdlog.h" using std::string; @@ -31,7 +25,8 @@ auto ParsedIrMessage::set_ts_pattern(TimestampPattern const* timestamp_pattern) m_ts_bytes = empty_msg.length(); } -auto ParsedIrMessage::append_to_logtype(string const& value, size_t begin_pos, size_t length) -> void { +auto ParsedIrMessage::append_to_logtype(string const& value, size_t begin_pos, size_t length) + -> void { m_logtype_entry.add_constant(value, begin_pos, length); m_orig_num_bytes += length; } @@ -55,13 +50,15 @@ auto ParsedIrMessage::add_dictionary_var(string const& dictionary_var) -> void { m_orig_num_bytes += dictionary_var.size(); } -auto ParsedIrMessage::add_encoded_integer(encoded_variable_t var, size_t orginal_size_in_bytes) -> void { +auto ParsedIrMessage::add_encoded_integer(encoded_variable_t var, size_t orginal_size_in_bytes) + -> void { 
m_variables.emplace_back(var); m_logtype_entry.add_int_var(); m_orig_num_bytes += orginal_size_in_bytes; } -auto ParsedIrMessage::add_encoded_float(encoded_variable_t var, size_t orginal_size_in_bytes) -> void { +auto ParsedIrMessage::add_encoded_float(encoded_variable_t var, size_t orginal_size_in_bytes) + -> void { m_variables.emplace_back(var); m_logtype_entry.add_float_var(); m_orig_num_bytes += orginal_size_in_bytes; diff --git a/components/core/src/ParsedIrMessage.hpp b/components/core/src/ParsedIrMessage.hpp index b5fd28c5e..afd68d10b 100644 --- a/components/core/src/ParsedIrMessage.hpp +++ b/components/core/src/ParsedIrMessage.hpp @@ -1,13 +1,11 @@ -#ifndef ParsedIrMessage_HPP -#define ParsedIrMessage_HPP +#ifndef PARSEDIRMESSAGE_HPP +#define PARSEDIRMESSAGE_HPP -// C++ standard libraries #include #include #include #include -// Project headers #include "Defs.h" #include "LogTypeDictionaryEntry.hpp" #include "TimestampPattern.hpp" @@ -101,4 +99,4 @@ class ParsedIrMessage { size_t m_ts_bytes{0}; }; -#endif // ParsedIrMessage_HPP +#endif // PARSEDIRMESSAGE_HPP diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 04d94b772..9956f859a 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -196,21 +196,22 @@ namespace clp { close_file_and_append_to_segment(archive_writer); } - bool FileCompressor::try_compressing_as_ir (size_t target_data_size_of_dicts, - streaming_archive::writer::Archive::UserConfig& archive_user_config, - size_t target_encoded_file_size, - const std::string& path_for_compression, - group_id_t group_id, - streaming_archive::writer::Archive& archive_writer, - ReaderInterface& reader) - { + bool FileCompressor::try_compressing_as_ir( + size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + std::string const& path_for_compression, + group_id_t group_id, + 
streaming_archive::writer::Archive& archive_writer, + ReaderInterface& reader + ) { // Construct the MessageParser which parse encoding type and metadata // as part of the construction process try { IrMessageParser ir_message_parser(reader); // Open compressed file - archive_writer.create_and_open_file(path_for_compression, group_id, - m_uuid_generator(), 0); + archive_writer + .create_and_open_file(path_for_compression, group_id, m_uuid_generator(), 0); // Assume one encoded file only has one timestamp pattern archive_writer.change_ts_pattern(ir_message_parser.get_ts_pattern()); @@ -220,18 +221,30 @@ namespace clp { break; } if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { - split_file_and_archive(archive_user_config, path_for_compression, group_id, - ir_message_parser.get_ts_pattern(), archive_writer); + split_file_and_archive( + archive_user_config, + path_for_compression, + group_id, + ir_message_parser.get_ts_pattern(), + archive_writer + ); } else if (archive_writer.get_file().get_encoded_size_in_bytes() >= - target_encoded_file_size) { - split_file(path_for_compression, group_id, ir_message_parser.get_ts_pattern(), - archive_writer); + target_encoded_file_size) + { + split_file( + path_for_compression, + group_id, + ir_message_parser.get_ts_pattern(), + archive_writer + ); } - const auto& parsed_msg = ir_message_parser.get_parsed_msg(); - archive_writer.write_ir_message(parsed_msg.get_ts(), - ir_message_parser.get_msg_logtype_entry(), - parsed_msg.get_vars(), - parsed_msg.get_orig_num_bytes()); + auto const& parsed_msg = ir_message_parser.get_parsed_msg(); + archive_writer.write_ir_message( + parsed_msg.get_ts(), + ir_message_parser.get_msg_logtype_entry(), + parsed_msg.get_vars(), + parsed_msg.get_orig_num_bytes() + ); } close_file_and_append_to_segment(archive_writer); return true; diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 
54446ec5d..b58b9f075 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -18,7 +18,8 @@ namespace { */ template auto is_variable_tag(encoded_tag_t tag, bool& is_encoded_var) -> bool { - static_assert(is_same_v || is_same_v); + static_assert((is_same_v) + || (is_same_v)); if (tag == cProtocol::Payload::VarStrLenUByte || tag == cProtocol::Payload::VarStrLenUShort || tag == cProtocol::Payload::VarStrLenInt) { @@ -79,8 +80,8 @@ namespace { * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data * to decode */ - IRErrorCode parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype) { - size_t logtype_length; + auto parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype) -> IRErrorCode { + size_t logtype_length{0}; if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { uint8_t length; if (false == decode_int(reader, length)) { @@ -119,8 +120,8 @@ namespace { * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough * data to decode */ - IRErrorCode - parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var) { + auto + parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var) -> IRErrorCode { // Decode variable's length size_t var_length; if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { @@ -167,8 +168,8 @@ namespace { * to decode */ template - IRErrorCode - parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { + auto + parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) -> IRErrorCode { static_assert( (is_same_v || is_same_v) @@ -223,11 +224,11 @@ namespace { * to decode */ template - IRErrorCode generic_decode_next_message( + auto generic_decode_next_message( ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp - ) { + ) -> IRErrorCode { 
message.clear(); vector encoded_vars; @@ -292,11 +293,11 @@ namespace { * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data * to decode */ - IRErrorCode read_metadata_info( + auto read_metadata_info( ReaderInterface& reader, encoded_tag_t& metadata_type, uint16_t& metadata_size - ) { + ) -> IRErrorCode { if (ErrorCode_Success != reader.try_read_numeric_value(metadata_type)) { return IRErrorCode_Incomplete_IR; } diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 3b71d5f1b..366d34fbd 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -46,7 +46,7 @@ class DecodingException : public TraceableException { * @return ErrorCode_Incomplete_IR if reader doesn't contain enough data to * decode */ -IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding); +auto get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding) -> IRErrorCode; /** * Parse logtypes, dictionary variables and encoded variables @@ -63,13 +63,13 @@ IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encod * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data */ template -IRErrorCode generic_parse_tokens( +auto generic_parse_tokens( ReaderInterface& reader, std::string& logtype, std::vector& encoded_vars, std::vector& dict_vars, epoch_time_ms_t& timestamp -); +) -> IRErrorCode; /** * Decodes the message consists of the tokens and calls the given methods @@ -77,7 +77,7 @@ IRErrorCode generic_parse_tokens( * @tparam encoded_variable_t Type of the encoded variable * @tparam ConstantHandler Method to handle constants in the logtypes. 
* Signature: (const std::string&, size_t, size_t) -> void - * @tparam ConstantRemainderHandler Method to handle remaining constant in the + * @tparam ConstantRemainderHandler Method to handle the last constant in the * logtypes. Signature: (const std::string&, size_t, size_t) -> void * @tparam EncodedIntHandler Method to handle encoded integers. * Signature: (encoded_variable_t) -> void @@ -124,12 +124,12 @@ void generic_decode_message( * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to * decode */ -IRErrorCode decode_preamble( +auto decode_preamble( ReaderInterface& reader, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size -); +) -> IRErrorCode; /** * Decodes the preamble for an IR stream. @@ -141,11 +141,11 @@ IRErrorCode decode_preamble( * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough * data to decode */ -IRErrorCode decode_preamble( +auto decode_preamble( ReaderInterface& reader, encoded_tag_t& metadata_type, std::vector& metadata -); +) -> IRErrorCode; namespace eight_byte_encoding { /** @@ -161,8 +161,9 @@ namespace eight_byte_encoding { * decode * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode - decode_next_message(ReaderInterface& reader, std::string& message, epoch_time_ms_t& timestamp); + auto + decode_next_message(ReaderInterface& reader, std::string& message, epoch_time_ms_t& timestamp) + -> IRErrorCode; } // namespace eight_byte_encoding namespace four_byte_encoding { @@ -179,11 +180,11 @@ namespace four_byte_encoding { * decode * @return ErrorCode_End_of_IR if the IR ends */ - IRErrorCode decode_next_message( + auto decode_next_message( ReaderInterface& reader, std::string& message, epoch_time_ms_t& timestamp_delta - ); + ) -> IRErrorCode; } // namespace four_byte_encoding } // namespace ffi::ir_stream From 7c3dc03413cc31d1130711b869f696c47342ead9 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 15 Aug 2023 00:28:40 -0400 
Subject: [PATCH 110/121] remove unused functions --- components/core/src/LogTypeDictionaryEntry.cpp | 8 -------- components/core/src/LogTypeDictionaryEntry.hpp | 10 ---------- 2 files changed, 18 deletions(-) diff --git a/components/core/src/LogTypeDictionaryEntry.cpp b/components/core/src/LogTypeDictionaryEntry.cpp index affda397c..f84f01e67 100644 --- a/components/core/src/LogTypeDictionaryEntry.cpp +++ b/components/core/src/LogTypeDictionaryEntry.cpp @@ -53,14 +53,6 @@ size_t LogTypeDictionaryEntry::get_data_size () const { m_ids_of_segments_containing_entry.size() * sizeof(segment_id_t); } -void LogTypeDictionaryEntry::set_logtype (std::string_view logtype) { - m_value = logtype; -} - -void LogTypeDictionaryEntry::set_var_positions(const std::vector& var_positions) { - m_var_positions = var_positions; -} - void LogTypeDictionaryEntry::add_constant (const string& value_containing_constant, size_t begin_pos, size_t length) { m_value.append(value_containing_constant, begin_pos, length); } diff --git a/components/core/src/LogTypeDictionaryEntry.hpp b/components/core/src/LogTypeDictionaryEntry.hpp index 53d518cdd..85a2aabde 100644 --- a/components/core/src/LogTypeDictionaryEntry.hpp +++ b/components/core/src/LogTypeDictionaryEntry.hpp @@ -86,16 +86,6 @@ class LogTypeDictionaryEntry : public DictionaryEntry { * @return Size of the data contained in this entry */ size_t get_data_size () const; - /** - * Sets the logtype from a given string_view - * @param value_containing_constant - */ - void set_logtype (std::string_view logtype); - - /** - * set_var_positions - */ - void set_var_positions(const std::vector& var_positions); /** * Adds a constant to the logtype From d45fd38ee19026a362e426576b76b5f80147f72d Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Wed, 16 Aug 2023 03:44:38 -0400 Subject: [PATCH 111/121] Undo formatting changes which should be their own PR.
--- .../src/ffi/ir_stream/decoding_methods.cpp | 627 +++++++++--------- .../src/ffi/ir_stream/decoding_methods.hpp | 19 +- 2 files changed, 330 insertions(+), 316 deletions(-) diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 6fcf453f9..f9c5a86e6 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -8,325 +8,344 @@ using std::string; using std::vector; namespace ffi::ir_stream { -namespace { - /** - * @tparam encoded_variable_t Type of the encoded variable - * @param tag - * @param is_encoded_var Returns true if tag is for an encoded variable (as - * opposed to a dictionary variable) - * @return Whether the tag is a variable tag - */ - template - auto is_variable_tag(encoded_tag_t tag, bool& is_encoded_var) -> bool { - static_assert((is_same_v) - || (is_same_v)); - if (tag == cProtocol::Payload::VarStrLenUByte || tag == cProtocol::Payload::VarStrLenUShort - || tag == cProtocol::Payload::VarStrLenInt) - { - is_encoded_var = false; +/** + * @tparam encoded_variable_t Type of the encoded variable + * @param tag + * @param is_encoded_var Returns true if tag is for an encoded variable (as + * opposed to a dictionary variable) + * @return Whether the tag is a variable tag + */ +template +static bool is_variable_tag(encoded_tag_t tag, bool& is_encoded_var); + +/** + * Decodes an integer from the given reader + * @tparam integer_t Type of the integer to decode + * @param reader + * @param value Returns the decoded integer + * @return true on success, false if the reader doesn't contain enough data to + * decode + */ +template +static bool decode_int(ReaderInterface& reader, integer_t& value); + +/** + * Decodes the next logtype string from the given reader + * @param reader + * @param encoded_tag + * @param logtype Returns the logtype string + * @return IRErrorCode_Success on success + * @return 
IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode + */ +static IRErrorCode +parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype); + +/** + * Decodes the next dictionary-type variable string from the given reader + * @param reader + * @param encoded_tag + * @param dict_var Returns the dictionary variable + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough data + * to decode + */ +static IRErrorCode +parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var); + +/** + * Parses the next timestamp from the given reader + * @tparam encoded_variable_t Type of the encoded variable + * @param reader + * @param encoded_tag + * @param ts Returns the timestamp delta if + * encoded_variable_t == four_byte_encoded_variable_t or the actual timestamp if + * encoded_variable_t == eight_byte_encoded_variable_t + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode + */ +template +static IRErrorCode +parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts); + +/** + * Decodes the next encoded message from the given reader + * @tparam encoded_variable_t Type of the encoded variable + * @param reader + * @param message Returns the decoded message + * @param timestamp Returns the timestamp delta if + * encoded_variable_t == four_byte_encoded_variable_t or the actual timestamp if + * encoded_variable_t == eight_byte_encoded_variable_t + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Decode_Error if the encoded message cannot be properly + * 
decoded + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode + */ +template +static IRErrorCode +generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp); + +/** + * Reads metadata information from the given reader + * @param reader + * @param metadata_type Returns the type of the metadata found in the IR + * @param metadata_pos Returns the starting position of the metadata in reader + * @param metadata_size Returns the size of the metadata written in the IR + * @return IRErrorCode_Success on success + * @return IRErrorCode_Corrupted_IR if reader contains invalid IR + * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to + * decode + */ +static IRErrorCode +read_metadata_info(ReaderInterface& reader, encoded_tag_t& metadata_type, uint16_t& metadata_size); + +template +static bool is_variable_tag(encoded_tag_t tag, bool& is_encoded_var) { + static_assert( + (is_same_v + || is_same_v) + ); + + if (tag == cProtocol::Payload::VarStrLenUByte || tag == cProtocol::Payload::VarStrLenUShort + || tag == cProtocol::Payload::VarStrLenInt) + { + is_encoded_var = false; + return true; + } + + if constexpr (is_same_v) { + if (tag == cProtocol::Payload::VarEightByteEncoding) { + is_encoded_var = true; return true; } - if constexpr (is_same_v) { - if (tag == cProtocol::Payload::VarEightByteEncoding) { - is_encoded_var = true; - return true; - } - } else { - if (tag == cProtocol::Payload::VarFourByteEncoding) { - is_encoded_var = true; - return true; - } + } else { + if (tag == cProtocol::Payload::VarFourByteEncoding) { + is_encoded_var = true; + return true; } + } + return false; +} + +template +static bool decode_int(ReaderInterface& reader, integer_t& value) { + integer_t value_little_endian; + if (reader.try_read_numeric_value(value_little_endian) != ErrorCode_Success) { return false; } - /** - * Decodes an integer from the given reader - * @tparam integer_t Type of the integer 
to decode - * @param reader - * @param value Returns the decoded integer - * @return true on success, false if the reader doesn't contain enough data - * to decode - */ - template - bool decode_int(ReaderInterface& reader, integer_t& value) { - integer_t value_little_endian; - if (reader.try_read_numeric_value(value_little_endian) != ErrorCode_Success) { - return false; - } + constexpr auto read_size = sizeof(integer_t); + static_assert(read_size == 1 || read_size == 2 || read_size == 4 || read_size == 8); + if constexpr (read_size == 1) { + value = value_little_endian; + } else if constexpr (read_size == 2) { + value = bswap_16(value_little_endian); + } else if constexpr (read_size == 4) { + value = bswap_32(value_little_endian); + } else if constexpr (read_size == 8) { + value = bswap_64(value_little_endian); + } + return true; +} - constexpr auto read_size = sizeof(integer_t); - static_assert(read_size == 1 || read_size == 2 || read_size == 4 || read_size == 8); - if constexpr (read_size == 1) { - value = value_little_endian; - } else if constexpr (read_size == 2) { - value = bswap_16(value_little_endian); - } else if constexpr (read_size == 4) { - value = bswap_32(value_little_endian); - } else if constexpr (read_size == 8) { - value = bswap_64(value_little_endian); +static IRErrorCode +parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype) { + size_t logtype_length; + if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { + uint8_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; } - return true; + logtype_length = length; + } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenUShort) { + uint16_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + logtype_length = length; + } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenInt) { + int32_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + 
} + logtype_length = length; + } else { + return IRErrorCode_Corrupted_IR; } - /** - * Decodes the next logtype string from the given reader - * @param reader - * @param encoded_tag - * @param logtype Returns the logtype string - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ - auto parse_logtype(ReaderInterface& reader, encoded_tag_t encoded_tag, string& logtype) -> IRErrorCode { - size_t logtype_length{0}; - if (encoded_tag == cProtocol::Payload::LogtypeStrLenUByte) { - uint8_t length; - if (false == decode_int(reader, length)) { - return IRErrorCode_Incomplete_IR; - } - logtype_length = length; - } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenUShort) { - uint16_t length; - if (false == decode_int(reader, length)) { - return IRErrorCode_Incomplete_IR; - } - logtype_length = length; - } else if (encoded_tag == cProtocol::Payload::LogtypeStrLenInt) { - int32_t length; - if (false == decode_int(reader, length)) { - return IRErrorCode_Incomplete_IR; - } - logtype_length = length; - } else { - return IRErrorCode_Corrupted_IR; - } + if (ErrorCode_Success != reader.try_read_string(logtype_length, logtype)) { + return IRErrorCode_Incomplete_IR; + } + return IRErrorCode_Success; +} - if (ErrorCode_Success != reader.try_read_string(logtype_length, logtype)) { +static IRErrorCode +parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var) { + // Decode variable's length + size_t var_length; + if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { + uint8_t length; + if (false == decode_int(reader, length)) { return IRErrorCode_Incomplete_IR; } - return IRErrorCode_Success; + var_length = length; + } else if (cProtocol::Payload::VarStrLenUShort == encoded_tag) { + uint16_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + var_length 
= length; + } else if (cProtocol::Payload::VarStrLenInt == encoded_tag) { + int32_t length; + if (false == decode_int(reader, length)) { + return IRErrorCode_Incomplete_IR; + } + var_length = length; + } else { + return IRErrorCode_Corrupted_IR; } - /** - * Decodes the next dictionary-type variable string from the given reader - * @param reader - * @param encoded_tag - * @param dict_var Returns the dictionary variable - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if input buffer doesn't contain enough - * data to decode - */ - auto - parse_dictionary_var(ReaderInterface& reader, encoded_tag_t encoded_tag, string& dict_var) -> IRErrorCode { - // Decode variable's length - size_t var_length; - if (cProtocol::Payload::VarStrLenUByte == encoded_tag) { - uint8_t length; - if (false == decode_int(reader, length)) { + // Read the dictionary variable + if (ErrorCode_Success != reader.try_read_string(var_length, dict_var)) { + return IRErrorCode_Incomplete_IR; + } + + return IRErrorCode_Success; +} + +template +static IRErrorCode +parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) { + static_assert( + (is_same_v + || is_same_v) + ); + + if constexpr (is_same_v) { + if (cProtocol::Payload::TimestampVal != encoded_tag) { + return IRErrorCode_Corrupted_IR; + } + if (false == decode_int(reader, ts)) { + return IRErrorCode_Incomplete_IR; + } + } else { + if (cProtocol::Payload::TimestampDeltaByte == encoded_tag) { + int8_t ts_delta; + if (false == decode_int(reader, ts_delta)) { return IRErrorCode_Incomplete_IR; } - var_length = length; - } else if (cProtocol::Payload::VarStrLenUShort == encoded_tag) { - uint16_t length; - if (false == decode_int(reader, length)) { + ts = ts_delta; + } else if (cProtocol::Payload::TimestampDeltaShort == encoded_tag) { + int16_t ts_delta; + if (false == decode_int(reader, ts_delta)) { return 
IRErrorCode_Incomplete_IR; } - var_length = length; - } else if (cProtocol::Payload::VarStrLenInt == encoded_tag) { - int32_t length; - if (false == decode_int(reader, length)) { + ts = ts_delta; + } else if (cProtocol::Payload::TimestampDeltaInt == encoded_tag) { + int32_t ts_delta; + if (false == decode_int(reader, ts_delta)) { return IRErrorCode_Incomplete_IR; } - var_length = length; + ts = ts_delta; } else { return IRErrorCode_Corrupted_IR; } + } + return IRErrorCode_Success; +} - // Read the dictionary variable - if (ErrorCode_Success != reader.try_read_string(var_length, dict_var)) { - return IRErrorCode_Incomplete_IR; - } - - return IRErrorCode_Success; +template +static IRErrorCode +generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) { + message.clear(); + + vector encoded_vars; + vector dict_vars; + string logtype; + if (auto error_code + = generic_parse_tokens(reader, logtype, encoded_vars, dict_vars, timestamp); + IRErrorCode_Success != error_code) + { + return error_code; } - /** - * Parses the next timestamp from the given reader - * @tparam encoded_variable_t Type of the encoded variable - * @param reader - * @param encoded_tag - * @param ts Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t or the actual - * timestamp if encoded_variable_t == eight_byte_encoded_variable_t - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ - template - auto - parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_ms_t& ts) -> IRErrorCode { - static_assert( - (is_same_v - || is_same_v) + // constant handler + auto constant_handler = [&message](string const& value, size_t begin_pos, size_t length) { + message.append(value, begin_pos, length); + }; + + // constant handler + auto constant_remainder_handler = 
[&message](string const& value, size_t begin_pos) { + message.append(value, begin_pos); + }; + + // encoded int handler + auto encoded_int_handler = [&message](encoded_variable_t value) { + message.append(decode_integer_var(value)); + }; + + // encoded float handler + auto encoded_float_handler = [&message](encoded_variable_t encoded_float) { + message.append(decode_float_var(encoded_float)); + }; + + // dict var handler + auto dict_var_handler = [&message](string const& dict_var) { message.append(dict_var); }; + + try { + generic_decode_message( + logtype, + encoded_vars, + dict_vars, + constant_handler, + constant_remainder_handler, + encoded_int_handler, + encoded_float_handler, + dict_var_handler ); - - if constexpr (is_same_v) { - if (cProtocol::Payload::TimestampVal != encoded_tag) { - return IRErrorCode_Corrupted_IR; - } - if (false == decode_int(reader, ts)) { - return IRErrorCode_Incomplete_IR; - } - } else { - if (cProtocol::Payload::TimestampDeltaByte == encoded_tag) { - int8_t ts_delta; - if (false == decode_int(reader, ts_delta)) { - return IRErrorCode_Incomplete_IR; - } - ts = ts_delta; - } else if (cProtocol::Payload::TimestampDeltaShort == encoded_tag) { - int16_t ts_delta; - if (false == decode_int(reader, ts_delta)) { - return IRErrorCode_Incomplete_IR; - } - ts = ts_delta; - } else if (cProtocol::Payload::TimestampDeltaInt == encoded_tag) { - int32_t ts_delta; - if (false == decode_int(reader, ts_delta)) { - return IRErrorCode_Incomplete_IR; - } - ts = ts_delta; - } else { - return IRErrorCode_Corrupted_IR; - } - } - return IRErrorCode_Success; + } catch (DecodingException const& e) { + return IRErrorCode_Decode_Error; } + return IRErrorCode_Success; +} - /** - * Decodes the next encoded message from the given reader - * @tparam encoded_variable_t Type of the encoded variable - * @param reader - * @param message Returns the decoded message - * @param timestamp Returns the timestamp delta if - * encoded_variable_t == four_byte_encoded_variable_t 
or the actual - * timestamp if encoded_variable_t == eight_byte_encoded_variable_t - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Decode_Error if the encoded message cannot be - * properly decoded - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ - template - auto generic_decode_next_message( - ReaderInterface& reader, - string& message, - epoch_time_ms_t& timestamp - ) -> IRErrorCode { - message.clear(); - - vector encoded_vars; - vector dict_vars; - string logtype; - if (auto error_code - = generic_parse_tokens(reader, logtype, encoded_vars, dict_vars, timestamp); - IRErrorCode_Success != error_code) - { - return error_code; - } - - // constant handler - auto constant_handler = [&message](string const& value, size_t begin_pos, size_t length) { - message.append(value, begin_pos, length); - }; - - // constant handler - auto constant_remainder_handler = [&message](string const& value, size_t begin_pos) { - message.append(value, begin_pos); - }; - - // encoded int handler - auto encoded_int_handler = [&message](encoded_variable_t value) { - message.append(decode_integer_var(value)); - }; - - // encoded float handler - auto encoded_float_handler = [&message](encoded_variable_t encoded_float) { - message.append(decode_float_var(encoded_float)); - }; - - // dict var handler - auto dict_var_handler = [&message](string const& dict_var) { message.append(dict_var); }; - - try { - generic_decode_message( - logtype, - encoded_vars, - dict_vars, - constant_handler, - constant_remainder_handler, - encoded_int_handler, - encoded_float_handler, - dict_var_handler - ); - } catch (DecodingException const& e) { - return IRErrorCode_Decode_Error; - } - return IRErrorCode_Success; +static IRErrorCode +read_metadata_info(ReaderInterface& reader, encoded_tag_t& metadata_type, uint16_t& metadata_size) { + if (ErrorCode_Success != 
reader.try_read_numeric_value(metadata_type)) { + return IRErrorCode_Incomplete_IR; } - /** - * Reads metadata information from the given reader - * @param reader - * @param metadata_type Returns the type of the metadata found in the IR - * @param metadata_pos Returns the starting position of the metadata in - * reader - * @param metadata_size Returns the size of the metadata written in the IR - * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data - * to decode - */ - auto read_metadata_info( - ReaderInterface& reader, - encoded_tag_t& metadata_type, - uint16_t& metadata_size - ) -> IRErrorCode { - if (ErrorCode_Success != reader.try_read_numeric_value(metadata_type)) { - return IRErrorCode_Incomplete_IR; - } - - // Read metadata length - encoded_tag_t encoded_tag; - if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { - return IRErrorCode_Incomplete_IR; - } - switch (encoded_tag) { - case cProtocol::Metadata::LengthUByte: - uint8_t ubyte_res; - if (false == decode_int(reader, ubyte_res)) { - return IRErrorCode_Incomplete_IR; - } - metadata_size = ubyte_res; - break; - case cProtocol::Metadata::LengthUShort: - uint16_t ushort_res; - if (false == decode_int(reader, ushort_res)) { - return IRErrorCode_Incomplete_IR; - } - metadata_size = ushort_res; - break; - default: - return IRErrorCode_Corrupted_IR; - } - return IRErrorCode_Success; + // Read metadata length + encoded_tag_t encoded_tag; + if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { + return IRErrorCode_Incomplete_IR; } -} // namespace + switch (encoded_tag) { + case cProtocol::Metadata::LengthUByte: + uint8_t ubyte_res; + if (false == decode_int(reader, ubyte_res)) { + return IRErrorCode_Incomplete_IR; + } + metadata_size = ubyte_res; + break; + case cProtocol::Metadata::LengthUShort: + uint16_t ushort_res; + if (false == decode_int(reader, 
ushort_res)) { + return IRErrorCode_Incomplete_IR; + } + metadata_size = ushort_res; + break; + default: + return IRErrorCode_Corrupted_IR; + } + return IRErrorCode_Success; +} template auto generic_parse_tokens( @@ -387,23 +406,17 @@ auto generic_parse_tokens( return IRErrorCode_Success; } -auto get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding) -> IRErrorCode { - std::vector buffer(cProtocol::MagicNumberLength, '\0'); - auto error_code = reader.try_read_exact_length(buffer.data(), cProtocol::MagicNumberLength); +IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding) { + char buffer[cProtocol::MagicNumberLength]; + auto error_code = reader.try_read_exact_length(buffer, cProtocol::MagicNumberLength); if (error_code != ErrorCode_Success) { return IRErrorCode_Incomplete_IR; } - if (0 - == memcmp( - buffer.data(), - cProtocol::FourByteEncodingMagicNumber, - cProtocol::MagicNumberLength - )) - { + if (0 == memcmp(buffer, cProtocol::FourByteEncodingMagicNumber, cProtocol::MagicNumberLength)) { is_four_bytes_encoding = true; } else if ((0 == memcmp( - buffer.data(), + buffer, cProtocol::EightByteEncodingMagicNumber, cProtocol::MagicNumberLength ))) @@ -415,12 +428,12 @@ auto get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding) -> return IRErrorCode_Success; } -auto decode_preamble( +IRErrorCode decode_preamble( ReaderInterface& reader, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size -) -> IRErrorCode { +) { if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); error_code != IRErrorCode_Success) { @@ -433,11 +446,11 @@ auto decode_preamble( return IRErrorCode_Success; } -auto decode_preamble( +IRErrorCode decode_preamble( ReaderInterface& reader, encoded_tag_t& metadata_type, std::vector& metadata -) -> IRErrorCode { +) { uint16_t metadata_size{0}; if (auto error_code = read_metadata_info(reader, metadata_type, metadata_size); error_code != 
IRErrorCode_Success) @@ -457,9 +470,11 @@ auto decode_preamble( } namespace four_byte_encoding { - auto - decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp_delta) - -> IRErrorCode { + IRErrorCode decode_next_message( + ReaderInterface& reader, + string& message, + epoch_time_ms_t& timestamp_delta + ) { return generic_decode_next_message( reader, message, @@ -469,8 +484,8 @@ namespace four_byte_encoding { } // namespace four_byte_encoding namespace eight_byte_encoding { - auto decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) - -> IRErrorCode { + IRErrorCode + decode_next_message(ReaderInterface& reader, string& message, epoch_time_ms_t& timestamp) { return generic_decode_next_message( reader, message, diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 366d34fbd..97e991d17 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -46,7 +46,7 @@ class DecodingException : public TraceableException { * @return ErrorCode_Incomplete_IR if reader doesn't contain enough data to * decode */ -auto get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding) -> IRErrorCode; +IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding); /** * Parse logtypes, dictionary variables and encoded variables @@ -124,12 +124,12 @@ void generic_decode_message( * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to * decode */ -auto decode_preamble( +IRErrorCode decode_preamble( ReaderInterface& reader, encoded_tag_t& metadata_type, size_t& metadata_pos, uint16_t& metadata_size -) -> IRErrorCode; +); /** * Decodes the preamble for an IR stream. 
@@ -141,11 +141,11 @@ auto decode_preamble( * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough * data to decode */ -auto decode_preamble( +IRErrorCode decode_preamble( ReaderInterface& reader, encoded_tag_t& metadata_type, std::vector& metadata -) -> IRErrorCode; +); namespace eight_byte_encoding { /** @@ -161,9 +161,8 @@ namespace eight_byte_encoding { * decode * @return ErrorCode_End_of_IR if the IR ends */ - auto - decode_next_message(ReaderInterface& reader, std::string& message, epoch_time_ms_t& timestamp) - -> IRErrorCode; + IRErrorCode + decode_next_message(ReaderInterface& reader, std::string& message, epoch_time_ms_t& timestamp); } // namespace eight_byte_encoding namespace four_byte_encoding { @@ -180,11 +179,11 @@ namespace four_byte_encoding { * decode * @return ErrorCode_End_of_IR if the IR ends */ - auto decode_next_message( + IRErrorCode decode_next_message( ReaderInterface& reader, std::string& message, epoch_time_ms_t& timestamp_delta - ) -> IRErrorCode; + ); } // namespace four_byte_encoding } // namespace ffi::ir_stream From dd91a640ccb3b1092e798c977339944b8c46c9a2 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Wed, 16 Aug 2023 06:17:59 -0400 Subject: [PATCH 112/121] Fix includes --- components/core/src/IrMessageParser.cpp | 5 +++-- components/core/src/IrMessageParser.hpp | 6 ------ components/core/src/ParsedIrMessage.cpp | 4 ++-- components/core/src/clp/FileCompressor.hpp | 4 ++-- components/core/src/ffi/ir_stream/decoding_methods.tpp | 2 -- 5 files changed, 7 insertions(+), 14 deletions(-) diff --git a/components/core/src/IrMessageParser.cpp b/components/core/src/IrMessageParser.cpp index ee9a83a5f..515e2fc3b 100644 --- a/components/core/src/IrMessageParser.cpp +++ b/components/core/src/IrMessageParser.cpp @@ -1,11 +1,12 @@ #include "IrMessageParser.hpp" -#include "../../../submodules/json/single_include/nlohmann/json.hpp" +#include +#include + #include "BufferReader.hpp" 
#include "EncodedVariableInterpreter.hpp" #include "ffi/encoding_methods.hpp" #include "ffi/ir_stream/protocol_constants.hpp" -#include "spdlog/spdlog.h" using ffi::cVariablePlaceholderEscapeCharacter; using ffi::eight_byte_encoded_variable_t; diff --git a/components/core/src/IrMessageParser.hpp b/components/core/src/IrMessageParser.hpp index 076372131..eb0d5b568 100644 --- a/components/core/src/IrMessageParser.hpp +++ b/components/core/src/IrMessageParser.hpp @@ -1,12 +1,6 @@ - #ifndef IrMessageParser_HPP #define IrMessageParser_HPP -// C standard libraries - -// C++ standard libraries - -// Project headers #include "ffi/ir_stream/decoding_methods.hpp" #include "ParsedIrMessage.hpp" #include "TraceableException.hpp" diff --git a/components/core/src/ParsedIrMessage.cpp b/components/core/src/ParsedIrMessage.cpp index 0eb92667e..14561afb3 100644 --- a/components/core/src/ParsedIrMessage.cpp +++ b/components/core/src/ParsedIrMessage.cpp @@ -1,8 +1,8 @@ #include "ParsedIrMessage.hpp" -#include "LogTypeDictionaryEntry.hpp" +#include -#include "spdlog/spdlog.h" +#include "LogTypeDictionaryEntry.hpp" using std::string; diff --git a/components/core/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp index 882aaefb0..936972a58 100644 --- a/components/core/src/clp/FileCompressor.hpp +++ b/components/core/src/clp/FileCompressor.hpp @@ -6,14 +6,14 @@ // Project headers #include "../BufferedFileReader.hpp" +#include "../compressor_frontend/LogParser.hpp" #include "../LibarchiveFileReader.hpp" #include "../LibarchiveReader.hpp" #include "../MessageParser.hpp" -#include "../ParsedMessage.hpp" #include "../ParsedIrMessage.hpp" +#include "../ParsedMessage.hpp" #include "../streaming_archive/writer/Archive.hpp" #include "FileToCompress.hpp" -#include "../compressor_frontend/LogParser.hpp" namespace clp { constexpr size_t cUtf8ValidationBufCapacity = 4096; diff --git a/components/core/src/ffi/ir_stream/decoding_methods.tpp 
b/components/core/src/ffi/ir_stream/decoding_methods.tpp index 3f664ec17..f40a3c6c6 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.tpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.tpp @@ -1,11 +1,9 @@ #ifndef FFI_IR_STREAM_DECODING_METHODS_TPP #define FFI_IR_STREAM_DECODING_METHODS_TPP -// C++ standard libraries #include #include -// Project headers #include "../../ReaderInterface.hpp" #include "../encoding_methods.hpp" #include "decoding_methods.hpp" From 1e45fa6ed4b1f40b5c4b9e3fde7bf80ed7900934 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 16 Aug 2023 11:54:17 -0400 Subject: [PATCH 113/121] another small clean up --- components/core/src/IrMessageParser.cpp | 50 +++----- components/core/src/IrMessageParser.hpp | 2 +- components/core/src/ParsedIrMessage.cpp | 38 +++--- components/core/src/ParsedIrMessage.hpp | 10 +- components/core/src/clp/FileCompressor.cpp | 115 +++++++++--------- components/core/src/clp/FileCompressor.hpp | 4 - .../src/streaming_archive/writer/Archive.cpp | 22 ++-- .../src/streaming_archive/writer/Archive.hpp | 9 +- 8 files changed, 121 insertions(+), 129 deletions(-) diff --git a/components/core/src/IrMessageParser.cpp b/components/core/src/IrMessageParser.cpp index ee9a83a5f..a99b80323 100644 --- a/components/core/src/IrMessageParser.cpp +++ b/components/core/src/IrMessageParser.cpp @@ -16,25 +16,6 @@ using ffi::VariablePlaceholder; using std::string; using std::vector; -namespace { -/** - * Decodes Ir header from the reader and return its encoding type by reference - * @param reader - * @param is_four_bytes_encoded Returns the encoding type - * or Ir header that can't be properly decoded - */ -[[nodiscard]] auto decode_ir_magic_number(ReaderInterface& reader, bool& is_four_bytes_encoded) - -> bool { - // Note. On failure, this method doesn't recover file pos. 
- if (ffi::ir_stream::IRErrorCode_Success - != ffi::ir_stream::get_encoding_type(reader, is_four_bytes_encoded)) - { - return false; - } - return true; -} -} // namespace - /** * Constructs the class by setting the internal reader, parsing the metadata * and initializing variable based on the metadata @@ -43,7 +24,8 @@ namespace { * or IR data that can't be properly decoded */ IrMessageParser::IrMessageParser(ReaderInterface& reader) : m_reader(reader) { - if (false == decode_ir_magic_number(m_reader, m_is_four_bytes_encoded)) { + if (ffi::ir_stream::IRErrorCode_Success + != ffi::ir_stream::get_encoding_type(reader, m_is_four_bytes_encoded)) { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } @@ -86,6 +68,20 @@ auto IrMessageParser::parse_next_encoded_message() -> bool { return parse_next_eight_bytes_message(); } +auto IrMessageParser::is_ir_encoded(size_t sequence_length, char const* data) -> bool { + if (sequence_length < MagicNumberLength) { + return false; + } + bool is_four_bytes_encoded{false}; + BufferReader encoding_data(data, MagicNumberLength); + if (ffi::ir_stream::IRErrorCode_Success + != ffi::ir_stream::get_encoding_type(encoding_data, is_four_bytes_encoded)) + { + return false; + } + return true; +} + auto IrMessageParser::parse_next_eight_bytes_message() -> bool { m_msg.clear(); @@ -254,17 +250,3 @@ auto IrMessageParser::decode_json_preamble(std::string& json_metadata) -> bool { return true; } - -auto IrMessageParser::is_ir_encoded(size_t sequence_length, char const* data) -> bool { - if (sequence_length < MagicNumberLength) { - return false; - } - bool is_four_bytes_encoded{false}; - BufferReader encoding_data(data, MagicNumberLength); - if (ffi::ir_stream::IRErrorCode_Success - != ffi::ir_stream::get_encoding_type(encoding_data, is_four_bytes_encoded)) - { - return false; - } - return true; -} diff --git a/components/core/src/IrMessageParser.hpp b/components/core/src/IrMessageParser.hpp index 076372131..d695c75e2 100644 --- 
a/components/core/src/IrMessageParser.hpp +++ b/components/core/src/IrMessageParser.hpp @@ -34,7 +34,7 @@ class IrMessageParser { explicit IrMessageParser(ReaderInterface& reader); // Methods - auto get_ts_pattern() -> TimestampPattern* { return &m_ts_pattern; } + auto get_ts_pattern() const -> TimestampPattern const* { return &m_ts_pattern; } [[nodiscard]] auto get_parsed_msg() const -> ParsedIrMessage const& { return m_msg; } diff --git a/components/core/src/ParsedIrMessage.cpp b/components/core/src/ParsedIrMessage.cpp index 0eb92667e..f0831c3db 100644 --- a/components/core/src/ParsedIrMessage.cpp +++ b/components/core/src/ParsedIrMessage.cpp @@ -6,11 +6,17 @@ using std::string; -auto ParsedIrMessage::set_ts(epochtime_t ts) -> void { - m_ts = ts; - if (ts != 0) { - m_orig_num_bytes += m_ts_bytes; - } +auto ParsedIrMessage::clear() -> void { + m_ts_patt = nullptr; + m_ts_bytes = 0; + clear_except_ts_patt(); +} + +auto ParsedIrMessage::clear_except_ts_patt() -> void { + m_variables.clear(); + m_orig_num_bytes = 0; + m_ts = 0; + m_logtype_entry.clear(); } auto ParsedIrMessage::set_ts_pattern(TimestampPattern const* timestamp_pattern) -> void { @@ -25,25 +31,21 @@ auto ParsedIrMessage::set_ts_pattern(TimestampPattern const* timestamp_pattern) m_ts_bytes = empty_msg.length(); } +auto ParsedIrMessage::set_ts(epochtime_t ts) -> void { + m_ts = ts; + if (ts != 0) { + // instead of converting the ts back to string to get the size, we use + // a previously calculated approximated value + m_orig_num_bytes += m_ts_bytes; + } +} + auto ParsedIrMessage::append_to_logtype(string const& value, size_t begin_pos, size_t length) -> void { m_logtype_entry.add_constant(value, begin_pos, length); m_orig_num_bytes += length; } -auto ParsedIrMessage::clear() -> void { - m_ts_patt = nullptr; - m_ts_bytes = 0; - clear_except_ts_patt(); -} - -auto ParsedIrMessage::clear_except_ts_patt() -> void { - m_variables.clear(); - m_orig_num_bytes = 0; - m_ts = 0; - m_logtype_entry.clear(); -} - 
auto ParsedIrMessage::add_dictionary_var(string const& dictionary_var) -> void { m_variables.emplace_back(dictionary_var); m_logtype_entry.add_dictionary_var(); diff --git a/components/core/src/ParsedIrMessage.hpp b/components/core/src/ParsedIrMessage.hpp index afd68d10b..683f80cad 100644 --- a/components/core/src/ParsedIrMessage.hpp +++ b/components/core/src/ParsedIrMessage.hpp @@ -68,19 +68,21 @@ class ParsedIrMessage { // Methods auto clear() -> void; + auto clear_except_ts_patt() -> void; - // setter - auto set_ts(epochtime_t ts) -> void; auto set_ts_pattern(TimestampPattern const* timestamp_pattern) -> void; - // note, this logtype is already escaped + auto set_ts(epochtime_t ts) -> void; + auto append_to_logtype(std::string const& value, size_t begin_pos, size_t length) -> void; + auto add_encoded_integer(encoded_variable_t var, size_t original_size_in_bytes) -> void; + auto add_encoded_float(encoded_variable_t var, size_t original_size_in_bytes) -> void; + auto add_dictionary_var(std::string const& dictionary_var) -> void; - // getter [[nodiscard]] auto get_ts() const -> epochtime_t { return m_ts; } auto get_logtype_entry() -> LogTypeDictionaryEntry& { return m_logtype_entry; } diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 9956f859a..768634755 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -196,63 +196,6 @@ namespace clp { close_file_and_append_to_segment(archive_writer); } - bool FileCompressor::try_compressing_as_ir( - size_t target_data_size_of_dicts, - streaming_archive::writer::Archive::UserConfig& archive_user_config, - size_t target_encoded_file_size, - std::string const& path_for_compression, - group_id_t group_id, - streaming_archive::writer::Archive& archive_writer, - ReaderInterface& reader - ) { - // Construct the MessageParser which parse encoding type and metadata - // as part of the construction process - try { - 
IrMessageParser ir_message_parser(reader); - // Open compressed file - archive_writer - .create_and_open_file(path_for_compression, group_id, m_uuid_generator(), 0); - - // Assume one encoded file only has one timestamp pattern - archive_writer.change_ts_pattern(ir_message_parser.get_ts_pattern()); - - while (true) { - if (false == ir_message_parser.parse_next_encoded_message()) { - break; - } - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { - split_file_and_archive( - archive_user_config, - path_for_compression, - group_id, - ir_message_parser.get_ts_pattern(), - archive_writer - ); - } else if (archive_writer.get_file().get_encoded_size_in_bytes() >= - target_encoded_file_size) - { - split_file( - path_for_compression, - group_id, - ir_message_parser.get_ts_pattern(), - archive_writer - ); - } - auto const& parsed_msg = ir_message_parser.get_parsed_msg(); - archive_writer.write_ir_message( - parsed_msg.get_ts(), - ir_message_parser.get_msg_logtype_entry(), - parsed_msg.get_vars(), - parsed_msg.get_orig_num_bytes() - ); - } - close_file_and_append_to_segment(archive_writer); - return true; - } catch (TraceableException& e) { - return false; - } - } - bool FileCompressor::try_compressing_as_archive (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config, size_t target_encoded_file_size, const FileToCompress& file_to_compress, streaming_archive::writer::Archive& archive_writer, bool use_heuristic) @@ -379,4 +322,62 @@ namespace clp { return succeeded; } + + bool FileCompressor::try_compressing_as_ir( + size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + std::string const& path_for_compression, + group_id_t group_id, + streaming_archive::writer::Archive& archive_writer, + ReaderInterface& reader + ) { + try { + // Construct the IrMessageParser which parse encoding type and + // metadata as part of 
the construction process + IrMessageParser ir_message_parser(reader); + + // Open compressed file + archive_writer + .create_and_open_file(path_for_compression, group_id, m_uuid_generator(), 0); + + // Assume one encoded file only has one timestamp pattern + archive_writer.change_ts_pattern(ir_message_parser.get_ts_pattern()); + + while (true) { + if (false == ir_message_parser.parse_next_encoded_message()) { + break; + } + if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + split_file_and_archive( + archive_user_config, + path_for_compression, + group_id, + ir_message_parser.get_ts_pattern(), + archive_writer + ); + } else if (archive_writer.get_file().get_encoded_size_in_bytes() >= + target_encoded_file_size) + { + split_file( + path_for_compression, + group_id, + ir_message_parser.get_ts_pattern(), + archive_writer + ); + } + auto const& parsed_msg = ir_message_parser.get_parsed_msg(); + archive_writer.write_ir_message( + parsed_msg.get_ts(), + ir_message_parser.get_msg_logtype_entry(), + parsed_msg.get_vars(), + parsed_msg.get_orig_num_bytes() + ); + } + close_file_and_append_to_segment(archive_writer); + return true; + } catch (TraceableException& e) { + return false; + } + } } diff --git a/components/core/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp index 882aaefb0..d59e63193 100644 --- a/components/core/src/clp/FileCompressor.hpp +++ b/components/core/src/clp/FileCompressor.hpp @@ -1,10 +1,8 @@ #ifndef CLP_FILECOMPRESSOR_HPP #define CLP_FILECOMPRESSOR_HPP -// Boost libraries #include -// Project headers #include "../BufferedFileReader.hpp" #include "../LibarchiveFileReader.hpp" #include "../LibarchiveReader.hpp" @@ -16,8 +14,6 @@ #include "../compressor_frontend/LogParser.hpp" namespace clp { - constexpr size_t cUtf8ValidationBufCapacity = 4096; - /** * Class to parse and compress a file into a streaming archive */ diff --git a/components/core/src/streaming_archive/writer/Archive.cpp 
b/components/core/src/streaming_archive/writer/Archive.cpp index 3100602c0..fe30ddee8 100644 --- a/components/core/src/streaming_archive/writer/Archive.cpp +++ b/components/core/src/streaming_archive/writer/Archive.cpp @@ -269,10 +269,12 @@ namespace streaming_archive::writer { } } - void Archive::write_ir_message (epochtime_t timestamp, - LogTypeDictionaryEntry& logtype_entry, - const std::vector& variables, - size_t num_uncompressed_bytes) { + void Archive::write_ir_message( + epochtime_t timestamp, + LogTypeDictionaryEntry& logtype_entry, + std::vector const& variables, + size_t num_uncompressed_bytes + ) { // Encode logtype logtype_dictionary_id_t logtype_id; m_logtype_dict.add_entry(logtype_entry, logtype_id); @@ -280,7 +282,7 @@ namespace streaming_archive::writer { vector encoded_vars; vector var_ids; // Encode variable base on type - for (const auto& var : variables) { + for (auto const& var : variables) { if (var.type() == ParsedIrMessage::VariableType::EncodedVar) { encoded_vars.push_back(var.get_encoded_var()); } else if (var.type() == ParsedIrMessage::VariableType::DictVar) { @@ -293,8 +295,13 @@ namespace streaming_archive::writer { } } - m_file->write_encoded_msg(timestamp, logtype_id, encoded_vars, - var_ids, num_uncompressed_bytes); + m_file->write_encoded_msg( + timestamp, + logtype_id, + encoded_vars, + var_ids, + num_uncompressed_bytes + ); // Update segment indices if (m_file->has_ts_pattern()) { @@ -304,7 +311,6 @@ namespace streaming_archive::writer { m_logtype_ids_for_file_with_unassigned_segment.insert(logtype_id); m_var_ids_for_file_with_unassigned_segment.insert(var_ids.cbegin(), var_ids.cend()); } - } void Archive::write_msg_using_schema (compressor_frontend::Token*& uncompressed_msg, uint32_t uncompressed_msg_pos, const bool has_delimiter, diff --git a/components/core/src/streaming_archive/writer/Archive.hpp b/components/core/src/streaming_archive/writer/Archive.hpp index 4a0f753b8..d71dc6fcf 100644 --- 
a/components/core/src/streaming_archive/writer/Archive.hpp +++ b/components/core/src/streaming_archive/writer/Archive.hpp @@ -138,9 +138,12 @@ namespace streaming_archive { namespace writer { * @param num_uncompressed_bytes * @throw FileWriter::OperationFailed if any write fails */ - void write_ir_message (epochtime_t timestamp, LogTypeDictionaryEntry& logtype_entry, - const std::vector& variables, - size_t num_uncompressed_bytes); + void write_ir_message( + epochtime_t timestamp, + LogTypeDictionaryEntry& logtype_entry, + std::vector const& variables, + size_t num_uncompressed_bytes + ); /** * Encodes and writes a message to the given file using schema file * @param file From 788bf36547d217fe9ec9e496d0d2dca6354a4ffb Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 16 Aug 2023 12:12:55 -0400 Subject: [PATCH 114/121] small fixes --- components/core/src/ffi/ir_stream/decoding_methods.cpp | 5 ----- components/core/src/ffi/ir_stream/decoding_methods.hpp | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index f9c5a86e6..790a5ca6f 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -275,27 +275,22 @@ generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time return error_code; } - // constant handler auto constant_handler = [&message](string const& value, size_t begin_pos, size_t length) { message.append(value, begin_pos, length); }; - // constant handler auto constant_remainder_handler = [&message](string const& value, size_t begin_pos) { message.append(value, begin_pos); }; - // encoded int handler auto encoded_int_handler = [&message](encoded_variable_t value) { message.append(decode_integer_var(value)); }; - // encoded float handler auto encoded_float_handler = [&message](encoded_variable_t 
encoded_float) { message.append(decode_float_var(encoded_float)); }; - // dict var handler auto dict_var_handler = [&message](string const& dict_var) { message.append(dict_var); }; try { diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 97e991d17..37ab8c93f 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -78,7 +78,7 @@ auto generic_parse_tokens( * @tparam ConstantHandler Method to handle constants in the logtypes. * Signature: (const std::string&, size_t, size_t) -> void * @tparam ConstantRemainderHandler Method to handle the last constant in the - * logtypes. Signature: (const std::string&, size_t, size_t) -> void + * logtypes. Signature: (const std::string&, size_t) -> void * @tparam EncodedIntHandler Method to handle encoded integers. * Signature: (encoded_variable_t) -> void * @tparam EncodedFloatHandler Method to handle encoded float. 
From 0629ea07c642ba83b746a9972ec9a761640ffcb7 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Aug 2023 07:38:02 -0400 Subject: [PATCH 115/121] Refactoring/restructuring: - Replace IrMessageParser with LogEventDeserializer and ParsedIrMessage with LogEvent - Rewrite classes using RAII and avoid exceptions to make embedded development smoother - Pass IR log event through to writer::Archive to stay consistent with the raw log event encoding path - Deduplicate code --- components/core/CMakeLists.txt | 52 ++-- .../core/src/EncodedVariableInterpreter.cpp | 174 ++++++++---- .../core/src/EncodedVariableInterpreter.hpp | 70 ++++- components/core/src/IrMessageParser.cpp | 253 ------------------ components/core/src/IrMessageParser.hpp | 53 ---- components/core/src/ParsedIrMessage.cpp | 67 ----- components/core/src/ParsedIrMessage.hpp | 104 ------- components/core/src/clp/FileCompressor.cpp | 139 ++++++---- components/core/src/clp/FileCompressor.hpp | 54 +++- components/core/src/clp/FileCompressor.tpp | 56 ++++ components/core/src/ffi/encoding_methods.cpp | 24 ++ components/core/src/ffi/encoding_methods.hpp | 42 +++ components/core/src/ffi/encoding_methods.tpp | 36 ++- .../src/ffi/ir_stream/decoding_methods.cpp | 17 +- .../src/ffi/ir_stream/decoding_methods.hpp | 7 +- .../src/ffi/ir_stream/decoding_methods.tpp | 17 +- components/core/src/ir/LogEvent.hpp | 52 ++++ .../core/src/ir/LogEventDeserializer.hpp | 91 +++++++ .../core/src/ir/LogEventDeserializer.tpp | 110 ++++++++ components/core/src/ir/utils.cpp | 13 + components/core/src/ir/utils.hpp | 15 ++ .../src/streaming_archive/writer/Archive.cpp | 44 --- .../src/streaming_archive/writer/Archive.hpp | 22 +- .../src/streaming_archive/writer/Archive.tpp | 47 ++++ components/core/src/type_utils.hpp | 14 +- 25 files changed, 864 insertions(+), 709 deletions(-) delete mode 100644 components/core/src/IrMessageParser.cpp delete mode 100644 
components/core/src/IrMessageParser.hpp delete mode 100644 components/core/src/ParsedIrMessage.cpp delete mode 100644 components/core/src/ParsedIrMessage.hpp create mode 100644 components/core/src/clp/FileCompressor.tpp create mode 100644 components/core/src/ir/LogEvent.hpp create mode 100644 components/core/src/ir/LogEventDeserializer.hpp create mode 100644 components/core/src/ir/LogEventDeserializer.tpp create mode 100644 components/core/src/ir/utils.cpp create mode 100644 components/core/src/ir/utils.hpp create mode 100644 components/core/src/streaming_archive/writer/Archive.tpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index b30033cfa..7dc34e654 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -181,6 +181,7 @@ set(SOURCE_FILES_clp src/clp/decompression.hpp src/clp/FileCompressor.cpp src/clp/FileCompressor.hpp + src/clp/FileCompressor.tpp src/clp/FileDecompressor.cpp src/clp/FileDecompressor.hpp src/clp/FileToCompress.cpp @@ -191,15 +192,6 @@ set(SOURCE_FILES_clp src/clp/StructuredFileToCompress.hpp src/clp/utils.cpp src/clp/utils.hpp - src/ffi/encoding_methods.cpp - src/ffi/encoding_methods.hpp - src/ffi/encoding_methods.tpp - src/ffi/ir_stream/byteswap.hpp - src/ffi/ir_stream/decoding_methods.cpp - src/ffi/ir_stream/decoding_methods.hpp - src/ffi/ir_stream/decoding_methods.tpp - src/ffi/ir_stream/encoding_methods.cpp - src/ffi/ir_stream/encoding_methods.hpp src/compressor_frontend/Constants.hpp src/compressor_frontend/finite_automata/RegexAST.hpp src/compressor_frontend/finite_automata/RegexAST.tpp @@ -236,6 +228,15 @@ set(SOURCE_FILES_clp src/EncodedVariableInterpreter.cpp src/EncodedVariableInterpreter.hpp src/ErrorCode.hpp + src/ffi/encoding_methods.cpp + src/ffi/encoding_methods.hpp + src/ffi/encoding_methods.tpp + src/ffi/ir_stream/byteswap.hpp + src/ffi/ir_stream/decoding_methods.cpp + src/ffi/ir_stream/decoding_methods.hpp + src/ffi/ir_stream/decoding_methods.tpp + 
src/ffi/ir_stream/encoding_methods.cpp + src/ffi/ir_stream/encoding_methods.hpp src/FileReader.cpp src/FileReader.hpp src/FileWriter.cpp @@ -248,8 +249,11 @@ set(SOURCE_FILES_clp src/GlobalMySQLMetadataDB.hpp src/GlobalSQLiteMetadataDB.cpp src/GlobalSQLiteMetadataDB.hpp - src/IrMessageParser.cpp - src/IrMessageParser.hpp + src/ir/LogEvent.hpp + src/ir/LogEventDeserializer.hpp + src/ir/LogEventDeserializer.tpp + src/ir/utils.cpp + src/ir/utils.hpp src/LibarchiveFileReader.cpp src/LibarchiveFileReader.hpp src/LibarchiveReader.cpp @@ -273,8 +277,6 @@ set(SOURCE_FILES_clp src/PageAllocatedVector.hpp src/ParsedMessage.cpp src/ParsedMessage.hpp - src/ParsedIrMessage.cpp - src/ParsedIrMessage.hpp src/Platform.hpp src/Profiler.cpp src/Profiler.hpp @@ -306,6 +308,7 @@ set(SOURCE_FILES_clp src/streaming_archive/reader/SegmentManager.hpp src/streaming_archive/writer/Archive.cpp src/streaming_archive/writer/Archive.hpp + src/streaming_archive/writer/Archive.tpp src/streaming_archive/writer/File.cpp src/streaming_archive/writer/File.hpp src/streaming_archive/writer/Segment.cpp @@ -410,6 +413,12 @@ set(SOURCE_FILES_clg src/EncodedVariableInterpreter.cpp src/EncodedVariableInterpreter.hpp src/ErrorCode.hpp + src/ffi/encoding_methods.cpp + src/ffi/encoding_methods.hpp + src/ffi/encoding_methods.tpp + src/ffi/ir_stream/decoding_methods.cpp + src/ffi/ir_stream/decoding_methods.hpp + src/ffi/ir_stream/decoding_methods.tpp src/FileReader.cpp src/FileReader.hpp src/FileWriter.cpp @@ -424,6 +433,7 @@ set(SOURCE_FILES_clg src/GlobalSQLiteMetadataDB.hpp src/Grep.cpp src/Grep.hpp + src/ir/LogEvent.hpp src/LogTypeDictionaryEntry.cpp src/LogTypeDictionaryEntry.hpp src/LogTypeDictionaryReader.cpp @@ -567,12 +577,19 @@ set(SOURCE_FILES_clo src/EncodedVariableInterpreter.cpp src/EncodedVariableInterpreter.hpp src/ErrorCode.hpp + src/ffi/encoding_methods.cpp + src/ffi/encoding_methods.hpp + src/ffi/encoding_methods.tpp + src/ffi/ir_stream/decoding_methods.cpp + 
src/ffi/ir_stream/decoding_methods.hpp + src/ffi/ir_stream/decoding_methods.tpp src/FileReader.cpp src/FileReader.hpp src/FileWriter.cpp src/FileWriter.hpp src/Grep.cpp src/Grep.hpp + src/ir/LogEvent.hpp src/LogTypeDictionaryEntry.cpp src/LogTypeDictionaryEntry.hpp src/LogTypeDictionaryReader.cpp @@ -687,6 +704,7 @@ set(SOURCE_FILES_unitTest src/clp/decompression.hpp src/clp/FileCompressor.cpp src/clp/FileCompressor.hpp + src/clp/FileCompressor.tpp src/clp/FileDecompressor.cpp src/clp/FileDecompressor.hpp src/clp/FileToCompress.cpp @@ -771,8 +789,9 @@ set(SOURCE_FILES_unitTest src/GlobalSQLiteMetadataDB.hpp src/Grep.cpp src/Grep.hpp - src/IrMessageParser.cpp - src/IrMessageParser.hpp + src/ir/LogEvent.hpp + src/ir/utils.cpp + src/ir/utils.hpp src/LibarchiveFileReader.cpp src/LibarchiveFileReader.hpp src/LibarchiveReader.cpp @@ -794,8 +813,6 @@ set(SOURCE_FILES_unitTest src/MySQLPreparedStatement.hpp src/PageAllocatedVector.cpp src/PageAllocatedVector.hpp - src/ParsedIrMessage.cpp - src/ParsedIrMessage.hpp src/ParsedMessage.cpp src/ParsedMessage.hpp src/Platform.hpp @@ -829,6 +846,7 @@ set(SOURCE_FILES_unitTest src/streaming_archive/reader/SegmentManager.hpp src/streaming_archive/writer/Archive.cpp src/streaming_archive/writer/Archive.hpp + src/streaming_archive/writer/Archive.tpp src/streaming_archive/writer/File.cpp src/streaming_archive/writer/File.hpp src/streaming_archive/writer/Segment.cpp diff --git a/components/core/src/EncodedVariableInterpreter.cpp b/components/core/src/EncodedVariableInterpreter.cpp index 22fbba4fd..246897ea7 100644 --- a/components/core/src/EncodedVariableInterpreter.cpp +++ b/components/core/src/EncodedVariableInterpreter.cpp @@ -7,6 +7,7 @@ // Project headers #include "Defs.h" #include "ffi/encoding_methods.hpp" +#include "ffi/ir_stream/decoding_methods.hpp" #include "spdlog_with_specializations.hpp" #include "string_utils.hpp" #include "type_utils.hpp" @@ -195,39 +196,6 @@ void 
EncodedVariableInterpreter::convert_encoded_float_to_string (encoded_variab value[value_length - 1 - decimal_pos] = '.'; } -void EncodedVariableInterpreter::convert_four_bytes_float_to_eight_byte( - encoded_variable_t four_bytes_float, - encoded_variable_t& eight_bytes_float -) { - four_bytes_float = bit_cast(four_bytes_float); - - size_t decimal_pos; - size_t num_digits; - size_t digits; - bool is_negative; - - // Decode according to the format described in encode_string_as_float_compact_var - decimal_pos = (four_bytes_float & 0x07) + 1; - four_bytes_float >>= 3; - num_digits = (four_bytes_float & 0x07) + 1; - four_bytes_float >>= 3; - digits = four_bytes_float & ffi::cFourByteEncodedFloatDigitsBitMask; - four_bytes_float >>= 25; - is_negative = four_bytes_float > 0; - - // encode again. - eight_bytes_float = 0; - if (is_negative) { - eight_bytes_float = 1; - } - eight_bytes_float <<= 55; // 1 unused + 54 for digits of the float - eight_bytes_float |= digits & cEightByteEncodedFloatDigitsBitMask; - eight_bytes_float <<= 4; - eight_bytes_float |= (num_digits - 1) & 0x0F; - eight_bytes_float <<= 4; - eight_bytes_float |= (decimal_pos - 1) & 0x0F; -} - void EncodedVariableInterpreter::encode_and_add_to_dictionary (const string& message, LogTypeDictionaryEntry& logtype_dict_entry, VariableDictionaryWriter& var_dict, vector& encoded_vars, vector& var_ids) @@ -240,24 +208,79 @@ void EncodedVariableInterpreter::encode_and_add_to_dictionary (const string& mes // To avoid reallocating the logtype as we append to it, reserve enough space to hold the entire message logtype_dict_entry.reserve_constant_length(message.length()); while (logtype_dict_entry.parse_next_var(message, var_begin_pos, var_end_pos, var_str)) { - // Encode variable - encoded_variable_t encoded_var; - if (convert_string_to_representable_integer_var(var_str, encoded_var)) { - logtype_dict_entry.add_int_var(); - } else if (convert_string_to_representable_float_var(var_str, encoded_var)) { - 
logtype_dict_entry.add_float_var(); - } else { - // Variable string looks like a dictionary variable, so encode it as so - variable_dictionary_id_t id; - var_dict.add_entry(var_str, id); - encoded_var = encode_var_dict_id(id); - var_ids.push_back(id); + auto encoded_var = encode_var(var_str, logtype_dict_entry, var_dict, var_ids); + encoded_vars.push_back(encoded_var); + } +} - logtype_dict_entry.add_dictionary_var(); - } +template +void EncodedVariableInterpreter::encode_and_add_to_dictionary( + ir::LogEvent const& log_event, + LogTypeDictionaryEntry& logtype_dict_entry, + VariableDictionaryWriter& var_dict, + std::vector& encoded_vars, + std::vector& var_ids, + size_t& raw_num_bytes +) { + logtype_dict_entry.clear(); + logtype_dict_entry.reserve_constant_length(log_event.get_logtype().length()); + + raw_num_bytes = 0; + + auto constant_handler = [&](std::string const& value, size_t begin_pos, size_t length) { + raw_num_bytes += length; + logtype_dict_entry.add_constant(value, begin_pos, length); + }; + + auto encoded_int_handler = [&](encoded_variable_t encoded_var) { + raw_num_bytes += ffi::decode_integer_var(encoded_var).length(); + logtype_dict_entry.add_int_var(); + ffi::eight_byte_encoded_variable_t eight_byte_encoded_var{}; + if constexpr (std::is_same_v) { + eight_byte_encoded_var = encoded_var; + } else { // std::is_same_v + eight_byte_encoded_var = ffi::encode_four_byte_integer_as_eight_byte(encoded_var); + } + encoded_vars.push_back(eight_byte_encoded_var); + }; + + auto encoded_float_handler = [&](ffi::four_byte_encoded_variable_t encoded_var) { + raw_num_bytes += ffi::decode_float_var(encoded_var).length(); + logtype_dict_entry.add_float_var(); + + ffi::eight_byte_encoded_variable_t eight_byte_encoded_var{}; + if constexpr (std::is_same_v) { + eight_byte_encoded_var = encoded_var; + } else { // std::is_same_v + eight_byte_encoded_var = ffi::encode_four_byte_float_as_eight_byte(encoded_var); + } + encoded_vars.push_back(eight_byte_encoded_var); + }; + 
+ auto dict_var_handler = [&](string const& dict_var) { + raw_num_bytes += dict_var.length(); + + ffi::eight_byte_encoded_variable_t encoded_var{}; + if constexpr (std::is_same_v) { + encoded_var = encode_var_dict_id( + add_dict_var(dict_var, logtype_dict_entry, var_dict, var_ids) + ); + } else { // std::is_same_v + encoded_var = encode_var(dict_var, logtype_dict_entry, var_dict, var_ids); + } encoded_vars.push_back(encoded_var); - } + }; + + ffi::ir_stream::generic_decode_message( + log_event.get_logtype(), + log_event.get_encoded_vars(), + log_event.get_dict_vars(), + constant_handler, + encoded_int_handler, + encoded_float_handler, + dict_var_handler + ); } bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDictionaryEntry& logtype_dict_entry, const VariableDictionaryReader& var_dict, @@ -370,3 +393,58 @@ bool EncodedVariableInterpreter::wildcard_search_dictionary_and_get_encoded_matc encoded_variable_t EncodedVariableInterpreter::encode_var_dict_id (variable_dictionary_id_t id) { return bit_cast(id); } + +encoded_variable_t EncodedVariableInterpreter::encode_var( + string const& var, + LogTypeDictionaryEntry& logtype_dict_entry, + VariableDictionaryWriter& var_dict, + vector& var_ids +) { + encoded_variable_t encoded_var{0}; + if (convert_string_to_representable_integer_var(var, encoded_var)) { + logtype_dict_entry.add_int_var(); + } else if (convert_string_to_representable_float_var(var, encoded_var)) { + logtype_dict_entry.add_float_var(); + } else { + // Variable string looks like a dictionary variable, so encode it as so + encoded_var = encode_var_dict_id(add_dict_var(var, logtype_dict_entry, var_dict, var_ids)); + } + return encoded_var; +} + +variable_dictionary_id_t EncodedVariableInterpreter::add_dict_var( + string const& var, + LogTypeDictionaryEntry& logtype_dict_entry, + VariableDictionaryWriter& var_dict, + vector& var_ids +) { + variable_dictionary_id_t id{cVariableDictionaryIdMax}; + var_dict.add_entry(var, id); + 
var_ids.push_back(id); + + logtype_dict_entry.add_dictionary_var(); + + return id; +} + +// Explicitly declare template specializations so that we can define the +// template methods in this file +template +void EncodedVariableInterpreter::encode_and_add_to_dictionary( + ir::LogEvent const& log_event, + LogTypeDictionaryEntry& logtype_dict_entry, + VariableDictionaryWriter& var_dict, + std::vector& encoded_vars, + std::vector& var_ids, + size_t& raw_num_bytes +); + +template +void EncodedVariableInterpreter::encode_and_add_to_dictionary( + ir::LogEvent const& log_event, + LogTypeDictionaryEntry& logtype_dict_entry, + VariableDictionaryWriter& var_dict, + std::vector& encoded_vars, + std::vector& var_ids, + size_t& raw_num_bytes +); diff --git a/components/core/src/EncodedVariableInterpreter.hpp b/components/core/src/EncodedVariableInterpreter.hpp index 115120f42..d33cb7ec4 100644 --- a/components/core/src/EncodedVariableInterpreter.hpp +++ b/components/core/src/EncodedVariableInterpreter.hpp @@ -6,6 +6,7 @@ #include // Project headers +#include "ir/LogEvent.hpp" #include "Query.hpp" #include "TraceableException.hpp" #include "VariableDictionaryReader.hpp" @@ -63,15 +64,6 @@ class EncodedVariableInterpreter { */ static void convert_encoded_float_to_string (encoded_variable_t encoded_var, std::string& value); - /** - * Converts the four bytes encoded float to eight byte encoded float - * @param four_bytes_float - * @param eight_bytes_float - */ - static void convert_four_bytes_float_to_eight_byte( - encoded_variable_t four_bytes_float, - encoded_variable_t& eight_bytes_float - ); /** * Parses all variables from a message (while constructing the logtype) and encodes them (adding them to the variable dictionary if necessary) * @param message @@ -82,6 +74,32 @@ class EncodedVariableInterpreter { */ static void encode_and_add_to_dictionary (const std::string& message, LogTypeDictionaryEntry& logtype_dict_entry, VariableDictionaryWriter& var_dict, std::vector& 
encoded_vars, std::vector& var_ids); + + /** + * Encodes the given IR log event, constructing a logtype dictionary entry, + * and adding any dictionary variables to the dictionary. NOTE: Four-byte + * encoded variables will be converted to eight-byte encoded variables. + * @tparam encoded_variable_t The type of the encoded variables in the log + * event + * @param log_event + * @param logtype_dict_entry + * @param var_dict + * @param encoded_vars A container to store the encoded variables in + * @param var_ids A container to store the dictionary IDs for dictionary + * variables + * @param raw_num_bytes Returns an estimate of the number of bytes that + * this log event would occupy if it was not encoded in CLP's IR + */ + template + static void encode_and_add_to_dictionary( + ir::LogEvent const& log_event, + LogTypeDictionaryEntry& logtype_dict_entry, + VariableDictionaryWriter& var_dict, + std::vector& encoded_vars, + std::vector& var_ids, + size_t& raw_num_bytes + ); + /** * Decodes all variables and decompresses them into a message * @param logtype_dict_entry @@ -115,6 +133,40 @@ class EncodedVariableInterpreter { */ static bool wildcard_search_dictionary_and_get_encoded_matches (const std::string& var_wildcard_str, const VariableDictionaryReader& var_dict, bool ignore_case, SubQuery& sub_query); + +private: + /** + * Encodes the given string as a dictionary or non-dictionary variable and + * adds a corresponding placeholder to the logtype + * @param var + * @param logtype_dict_entry + * @param var_dict + * @param var_ids A container to add the dictionary ID to (if the string is + * a dictionary variable) + * @return The encoded variable + */ + static encoded_variable_t encode_var( + std::string const& var, + LogTypeDictionaryEntry& logtype_dict_entry, + VariableDictionaryWriter& var_dict, + std::vector& var_ids + ); + + /** + * Adds the given string to the variable dictionary and adds a corresponding + * placeholder to logtype + * @param var + * @param 
logtype_dict_entry + * @param var_dict + * @param var_ids A container to add the dictionary ID to + * @return The dictionary ID + */ + static variable_dictionary_id_t add_dict_var( + std::string const& var, + LogTypeDictionaryEntry& logtype_dict_entry, + VariableDictionaryWriter& var_dict, + std::vector& var_ids + ); }; #endif // ENCODEDVARIABLEINTERPRETER_HPP diff --git a/components/core/src/IrMessageParser.cpp b/components/core/src/IrMessageParser.cpp deleted file mode 100644 index fd0ed0e44..000000000 --- a/components/core/src/IrMessageParser.cpp +++ /dev/null @@ -1,253 +0,0 @@ -#include "IrMessageParser.hpp" - -#include -#include - -#include "BufferReader.hpp" -#include "EncodedVariableInterpreter.hpp" -#include "ffi/encoding_methods.hpp" -#include "ffi/ir_stream/protocol_constants.hpp" - -using ffi::cVariablePlaceholderEscapeCharacter; -using ffi::eight_byte_encoded_variable_t; -using ffi::four_byte_encoded_variable_t; -using ffi::ir_stream::cProtocol::MagicNumberLength; -using ffi::ir_stream::IRErrorCode; -using ffi::VariablePlaceholder; -using std::string; -using std::vector; - -/** - * Constructs the class by setting the internal reader, parsing the metadata - * and initializing variable based on the metadata - * @param reader - * @throw OperationFailed if the reader doesn't contain IR encoded data, - * or IR data that can't be properly decoded - */ -IrMessageParser::IrMessageParser(ReaderInterface& reader) : m_reader(reader) { - if (ffi::ir_stream::IRErrorCode_Success - != ffi::ir_stream::get_encoding_type(reader, m_is_four_bytes_encoded)) { - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - - string json_metadata; - if (false == decode_json_preamble(json_metadata)) { - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - - string const mocked_ts_pattern = "%Y-%m-%dT%H:%M:%S.%3"; - try { - auto metadata_json = nlohmann::json::parse(json_metadata); - string version = 
metadata_json.at(ffi::ir_stream::cProtocol::Metadata::VersionKey); - if (version != ffi::ir_stream::cProtocol::Metadata::VersionValue) { - SPDLOG_ERROR("Input IR has unsupported version {}", version); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - - // For now, use a fixed timestamp pattern - m_ts_pattern = TimestampPattern(0, mocked_ts_pattern); - - if (m_is_four_bytes_encoded) { - m_reference_timestamp = std::stoll( - metadata_json.at(ffi::ir_stream::cProtocol::Metadata::ReferenceTimestampKey) - .get() - ); - m_msg.set_ts(m_reference_timestamp); - } - - } catch (nlohmann::json::parse_error const& e) { - SPDLOG_ERROR("Failed to parse json metadata from reader"); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - m_msg.set_ts_pattern(&m_ts_pattern); -} - -auto IrMessageParser::parse_next_encoded_message() -> bool { - if (m_is_four_bytes_encoded) { - return parse_next_four_bytes_message(); - } - return parse_next_eight_bytes_message(); -} - -auto IrMessageParser::is_ir_encoded(size_t sequence_length, char const* data) -> bool { - if (sequence_length < MagicNumberLength) { - return false; - } - bool is_four_bytes_encoded{false}; - BufferReader encoding_data(data, MagicNumberLength); - if (ffi::ir_stream::IRErrorCode_Success - != ffi::ir_stream::get_encoding_type(encoding_data, is_four_bytes_encoded)) - { - return false; - } - return true; -} - -auto IrMessageParser::parse_next_eight_bytes_message() -> bool { - m_msg.clear(); - - epochtime_t ts{0}; - vector encoded_vars; - vector dict_vars; - string logtype; - - auto error_code - = ffi::ir_stream::generic_parse_tokens(m_reader, logtype, encoded_vars, dict_vars, ts); - - if (IRErrorCode::IRErrorCode_Success != error_code) { - if (IRErrorCode::IRErrorCode_Eof != error_code) { - SPDLOG_ERROR("Corrupted IR, error code: {}", error_code); - } - return false; - } - - auto constant_handler = [this](std::string const& value, size_t begin_pos, size_t length) { - 
m_msg.append_to_logtype(value, begin_pos, length); - }; - - auto constant_remainder_handler = [this](std::string const& value, size_t begin_pos) { - auto const remaining_size = value.length() - begin_pos; - m_msg.append_to_logtype(value, begin_pos, remaining_size); - }; - - auto encoded_int_handler = [this](eight_byte_encoded_variable_t value) { - auto decoded_int = ffi::decode_integer_var(value); - m_msg.add_encoded_integer(value, decoded_int.length()); - }; - - auto encoded_float_handler = [this](eight_byte_encoded_variable_t encoded_float) { - auto decoded_float = ffi::decode_float_var(encoded_float); - m_msg.add_encoded_float(encoded_float, decoded_float.size()); - }; - - auto dict_var_handler = [this](string const& dict_var) { m_msg.add_dictionary_var(dict_var); }; - - // handle timestamp - m_msg.set_ts(ts); - try { - ffi::ir_stream::generic_decode_message( - logtype, - encoded_vars, - dict_vars, - constant_handler, - constant_remainder_handler, - encoded_int_handler, - encoded_float_handler, - dict_var_handler - ); - } catch (ffi::ir_stream::DecodingException& e) { - SPDLOG_ERROR("Decoding failed with exception {}", e.what()); - return false; - } - - return true; -} - -auto IrMessageParser::parse_next_four_bytes_message() -> bool { - m_msg.clear(); - - epochtime_t ts{0}; - vector encoded_vars; - vector dict_vars; - string logtype; - - auto error_code - = ffi::ir_stream::generic_parse_tokens(m_reader, logtype, encoded_vars, dict_vars, ts); - - if (IRErrorCode::IRErrorCode_Success != error_code) { - if (IRErrorCode::IRErrorCode_Eof != error_code) { - SPDLOG_ERROR("Corrupted IR, error code: {}", error_code); - } - return false; - } - - auto constant_handler = [this](std::string const& value, size_t begin_pos, size_t length) { - m_msg.append_to_logtype(value, begin_pos, length); - }; - - auto constant_remainder_handler = [this](std::string const& value, size_t begin_pos) { - auto const remaining_size = value.length() - begin_pos; - m_msg.append_to_logtype(value, 
begin_pos, remaining_size); - }; - - auto encoded_int_handler = [this](four_byte_encoded_variable_t value) { - // assume that we need the actual size - auto decoded_int = ffi::decode_integer_var(value); - m_msg.add_encoded_integer(value, decoded_int.length()); - }; - - auto encoded_float_handler = [this](four_byte_encoded_variable_t encoded_float) { - const auto original_size_in_bytes = ffi::decode_float_var(encoded_float).size(); - eight_byte_encoded_variable_t converted_float{0}; - EncodedVariableInterpreter::convert_four_bytes_float_to_eight_byte( - encoded_float, - converted_float - ); - m_msg.add_encoded_float(converted_float, original_size_in_bytes); - }; - - auto dict_var_handler = [this](string const& dict_var) { - encoded_variable_t converted_var{0}; - if (EncodedVariableInterpreter::convert_string_to_representable_integer_var( - dict_var, - converted_var - )) - { - m_msg.add_encoded_integer(converted_var, dict_var.size()); - } else if (EncodedVariableInterpreter::convert_string_to_representable_float_var( - dict_var, - converted_var - )) - { - m_msg.add_encoded_float(converted_var, dict_var.size()); - } else { - m_msg.add_dictionary_var(dict_var); - } - }; - - // handle timestamp - m_reference_timestamp += ts; - m_msg.set_ts(m_reference_timestamp); - try { - ffi::ir_stream::generic_decode_message( - logtype, - encoded_vars, - dict_vars, - constant_handler, - constant_remainder_handler, - encoded_int_handler, - encoded_float_handler, - dict_var_handler - ); - } catch (ffi::ir_stream::DecodingException& e) { - SPDLOG_ERROR("Decoding failed with exception {}", e.what()); - return false; - } - - return true; -} - -auto IrMessageParser::decode_json_preamble(std::string& json_metadata) -> bool { - // Decode and parse metadata - ffi::ir_stream::encoded_tag_t metadata_type{0}; - std::vector metadata_vec; - - if (ffi::ir_stream::IRErrorCode_Success - != ffi::ir_stream::decode_preamble(m_reader, metadata_type, metadata_vec)) - { - SPDLOG_ERROR("Failed to parse 
metadata"); - return false; - } - - if (ffi::ir_stream::cProtocol::Metadata::EncodingJson != metadata_type) { - SPDLOG_ERROR("Unexpected metadata type {}", metadata_type); - return false; - } - - json_metadata.assign( - size_checked_pointer_cast(metadata_vec.data()), - metadata_vec.size() - ); - - return true; -} diff --git a/components/core/src/IrMessageParser.hpp b/components/core/src/IrMessageParser.hpp deleted file mode 100644 index 4d1ca63d5..000000000 --- a/components/core/src/IrMessageParser.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef IrMessageParser_HPP -#define IrMessageParser_HPP - -#include "ffi/ir_stream/decoding_methods.hpp" -#include "ParsedIrMessage.hpp" -#include "TraceableException.hpp" - -/* - * Class representing the parser that parses messages from encoded IR and - * converts the message into CLP encoding format - */ -class IrMessageParser { -public: - // Types - class OperationFailed : public TraceableException { - public: - // Constructors - OperationFailed(ErrorCode error_code, char const* const filename, int line_number) - : TraceableException(error_code, filename, line_number) {} - - // Methods - [[nodiscard]] auto what() const noexcept -> char const* override { - return "IrMessageParser operation failed"; - } - }; - - // Constructor - explicit IrMessageParser(ReaderInterface& reader); - - // Methods - auto get_ts_pattern() const -> TimestampPattern const* { return &m_ts_pattern; } - - [[nodiscard]] auto get_parsed_msg() const -> ParsedIrMessage const& { return m_msg; } - - auto get_msg_logtype_entry() -> LogTypeDictionaryEntry& { return m_msg.get_logtype_entry(); } - - [[nodiscard]] auto parse_next_encoded_message() -> bool; - static auto is_ir_encoded(size_t sequence_length, char const* data) -> bool; - -private: - [[nodiscard]] auto parse_next_four_bytes_message() -> bool; - [[nodiscard]] auto parse_next_eight_bytes_message() -> bool; - [[nodiscard]] auto decode_json_preamble(std::string& json_metadata) -> bool; - - // member variables - 
bool m_is_four_bytes_encoded{false}; - epochtime_t m_reference_timestamp; - TimestampPattern m_ts_pattern; - ParsedIrMessage m_msg; - ReaderInterface& m_reader; -}; - -#endif // IrMessageParser_HPP diff --git a/components/core/src/ParsedIrMessage.cpp b/components/core/src/ParsedIrMessage.cpp deleted file mode 100644 index d040d9360..000000000 --- a/components/core/src/ParsedIrMessage.cpp +++ /dev/null @@ -1,67 +0,0 @@ -#include "ParsedIrMessage.hpp" - -#include - -#include "LogTypeDictionaryEntry.hpp" - -using std::string; - -auto ParsedIrMessage::clear() -> void { - m_ts_patt = nullptr; - m_ts_bytes = 0; - clear_except_ts_patt(); -} - -auto ParsedIrMessage::clear_except_ts_patt() -> void { - m_variables.clear(); - m_orig_num_bytes = 0; - m_ts = 0; - m_logtype_entry.clear(); -} - -auto ParsedIrMessage::set_ts_pattern(TimestampPattern const* timestamp_pattern) -> void { - if (m_ts_patt != nullptr) { - SPDLOG_ERROR("Can not set different timestamp for an IR file"); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - m_ts_patt = timestamp_pattern; - // get a rough estimation of ts string size - string empty_msg; - m_ts_patt->insert_formatted_timestamp(0, empty_msg); - m_ts_bytes = empty_msg.length(); -} - -auto ParsedIrMessage::set_ts(epochtime_t ts) -> void { - m_ts = ts; - if (ts != 0) { - // instead of converting the ts back to string to get the size, we use - // a previously calculated approximated value - m_orig_num_bytes += m_ts_bytes; - } -} - -auto ParsedIrMessage::append_to_logtype(string const& value, size_t begin_pos, size_t length) - -> void { - m_logtype_entry.add_constant(value, begin_pos, length); - m_orig_num_bytes += length; -} - -auto ParsedIrMessage::add_dictionary_var(string const& dictionary_var) -> void { - m_variables.emplace_back(dictionary_var); - m_logtype_entry.add_dictionary_var(); - m_orig_num_bytes += dictionary_var.size(); -} - -auto ParsedIrMessage::add_encoded_integer(encoded_variable_t var, size_t 
orginal_size_in_bytes) - -> void { - m_variables.emplace_back(var); - m_logtype_entry.add_int_var(); - m_orig_num_bytes += orginal_size_in_bytes; -} - -auto ParsedIrMessage::add_encoded_float(encoded_variable_t var, size_t orginal_size_in_bytes) - -> void { - m_variables.emplace_back(var); - m_logtype_entry.add_float_var(); - m_orig_num_bytes += orginal_size_in_bytes; -} diff --git a/components/core/src/ParsedIrMessage.hpp b/components/core/src/ParsedIrMessage.hpp deleted file mode 100644 index 683f80cad..000000000 --- a/components/core/src/ParsedIrMessage.hpp +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef PARSEDIRMESSAGE_HPP -#define PARSEDIRMESSAGE_HPP - -#include -#include -#include -#include - -#include "Defs.h" -#include "LogTypeDictionaryEntry.hpp" -#include "TimestampPattern.hpp" - -/** - * ParsedIRMessage represents a (potentially multiline) log message parsed from encoded ir - * into four primary fields: logtype_entry, variables, timestamp and timestamp pattern. - */ -class ParsedIrMessage { -public: - // Types - class OperationFailed : public TraceableException { - public: - // Constructors - OperationFailed(ErrorCode error_code, char const* const filename, int line_number) - : TraceableException(error_code, filename, line_number) {} - - // Methods - [[nodiscard]] auto what() const noexcept -> char const* override { - return "ParsedIrMessage operation failed"; - } - }; - - enum class VariableType { - EncodedVar = 0, - DictVar, - Length - }; - - // Helper class to keep variables in the order as they appear in the - // original log messages - class IrVariable { - public: - explicit IrVariable(std::string dict_var) - : m_dict_var(std::move(dict_var)), - m_type(VariableType::DictVar) {} - - explicit IrVariable(encoded_variable_t encoded_var) - : m_encoded_var(encoded_var), - m_type(VariableType::EncodedVar) {} - - // Methods - [[nodiscard]] auto type() const -> VariableType { return m_type; } - - [[nodiscard]] auto get_encoded_var() const -> encoded_variable_t { - 
assert(m_type == VariableType::EncodedVar); - return m_encoded_var; - } - - [[nodiscard]] auto get_dict_var() const -> std::string const& { - assert(m_type == VariableType::DictVar); - return m_dict_var; - } - - private: - std::string m_dict_var; - encoded_variable_t m_encoded_var{0}; - VariableType m_type; - }; - - // Methods - auto clear() -> void; - - auto clear_except_ts_patt() -> void; - - auto set_ts_pattern(TimestampPattern const* timestamp_pattern) -> void; - - auto set_ts(epochtime_t ts) -> void; - - auto append_to_logtype(std::string const& value, size_t begin_pos, size_t length) -> void; - - auto add_encoded_integer(encoded_variable_t var, size_t original_size_in_bytes) -> void; - - auto add_encoded_float(encoded_variable_t var, size_t original_size_in_bytes) -> void; - - auto add_dictionary_var(std::string const& dictionary_var) -> void; - - [[nodiscard]] auto get_ts() const -> epochtime_t { return m_ts; } - - auto get_logtype_entry() -> LogTypeDictionaryEntry& { return m_logtype_entry; } - - [[nodiscard]] auto get_vars() const -> std::vector const& { return m_variables; } - - [[nodiscard]] auto get_orig_num_bytes() const -> size_t { return m_orig_num_bytes; } - -private: - // Variables - TimestampPattern const* m_ts_patt{nullptr}; - epochtime_t m_ts{0}; - LogTypeDictionaryEntry m_logtype_entry; - std::vector m_variables; - size_t m_orig_num_bytes{0}; - size_t m_ts_bytes{0}; -}; - -#endif // PARSEDIRMESSAGE_HPP diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 768634755..c16e2eebd 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -12,10 +12,12 @@ #include // Project headers -#include "../IrMessageParser.hpp" +#include "../ir/utils.hpp" #include "../Profiler.hpp" #include "utils.hpp" +using ir::has_ir_stream_magic_number; +using ir::LogEventDeserializer; using std::cout; using std::endl; using std::set; @@ -274,13 +276,13 @@ namespace clp { 
succeeded = false; continue; } - auto file_path = std::string(m_libarchive_reader.get_path()); char const* utf8_validation_buf{nullptr}; size_t utf8_validation_buf_len{0}; m_libarchive_file_reader.peek_buffered_data( utf8_validation_buf, utf8_validation_buf_len ); + string file_path{m_libarchive_reader.get_path()}; if (is_utf8_sequence(utf8_validation_buf_len, utf8_validation_buf)) { auto boost_path_for_compression = parent_boost_path / file_path; if (use_heuristic) { @@ -291,26 +293,27 @@ namespace clp { parse_and_encode(target_data_size_of_dicts, archive_user_config, target_encoded_file_size, boost_path_for_compression.string(), file_to_compress.get_group_id(), archive_writer, m_libarchive_file_reader); } - } else if (IrMessageParser::is_ir_encoded(utf8_validation_buf_len, - utf8_validation_buf)) { + } else if (has_ir_stream_magic_number({utf8_validation_buf, utf8_validation_buf_len})) { // Remove .clp suffix if found - if (file_path.length() > 4 && - file_path.substr(file_path.length() - 4) == ".clp") - { - file_path = file_path.substr(0, file_path.length() - 4); + static constexpr char cIrStreamExtension[] = ".clp"; + if (boost::iends_with(file_path, cIrStreamExtension)) { + file_path.resize(file_path.length() - strlen(cIrStreamExtension)); } auto boost_path_for_compression = parent_boost_path / file_path; - if (false == try_compressing_as_ir(target_data_size_of_dicts, - archive_user_config, - target_encoded_file_size, - boost_path_for_compression.string(), - file_to_compress.get_group_id(), - archive_writer, - m_libarchive_file_reader)) { - SPDLOG_ERROR("Failed to compress {} - corrupted IR", file_path); + + if (false == compress_ir_stream( + target_data_size_of_dicts, + archive_user_config, + target_encoded_file_size, + boost_path_for_compression.string(), + file_to_compress.get_group_id(), + archive_writer, + m_libarchive_file_reader + )) { + succeeded = false; } } else { - SPDLOG_ERROR("Cannot compress {} - not UTF-8 or IR encoded", file_path); + 
SPDLOG_ERROR("Cannot compress {} - not an IR stream or UTF-8 encoded", file_path); succeeded = false; } @@ -323,61 +326,89 @@ namespace clp { return succeeded; } - bool FileCompressor::try_compressing_as_ir( + bool FileCompressor::compress_ir_stream( size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config, size_t target_encoded_file_size, - std::string const& path_for_compression, + string const& path, group_id_t group_id, streaming_archive::writer::Archive& archive_writer, ReaderInterface& reader ) { - try { - // Construct the IrMessageParser which parse encoding type and - // metadata as part of the construction process - IrMessageParser ir_message_parser(reader); - - // Open compressed file - archive_writer - .create_and_open_file(path_for_compression, group_id, m_uuid_generator(), 0); - - // Assume one encoded file only has one timestamp pattern - archive_writer.change_ts_pattern(ir_message_parser.get_ts_pattern()); + bool uses_four_byte_encoding{false}; + auto ir_error_code = ffi::ir_stream::get_encoding_type(reader, uses_four_byte_encoding); + if (ffi::ir_stream::IRErrorCode_Success != ir_error_code) { + SPDLOG_ERROR("Cannot compress {}, IR error={}", path, static_cast(ir_error_code)); + return false; + } - while (true) { - if (false == ir_message_parser.parse_next_encoded_message()) { - break; - } - if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { - split_file_and_archive( + try { + std::error_code error_code{}; + if (uses_four_byte_encoding) { + auto result + = LogEventDeserializer::create(reader); + if (result.has_error()) { + error_code = result.error(); + } else { + error_code = compress_ir_stream_by_encoding( + target_data_size_of_dicts, archive_user_config, - path_for_compression, + target_encoded_file_size, + path, group_id, - ir_message_parser.get_ts_pattern(), - archive_writer + archive_writer, + result.value() ); - } else if 
(archive_writer.get_file().get_encoded_size_in_bytes() >= - target_encoded_file_size) - { - split_file( - path_for_compression, + } + } else { + auto result + = LogEventDeserializer::create(reader); + if (result.has_error()) { + error_code = result.error(); + } else { + error_code = compress_ir_stream_by_encoding( + target_data_size_of_dicts, + archive_user_config, + target_encoded_file_size, + path, group_id, - ir_message_parser.get_ts_pattern(), - archive_writer + archive_writer, + result.value() ); } - auto const& parsed_msg = ir_message_parser.get_parsed_msg(); - archive_writer.write_ir_message( - parsed_msg.get_ts(), - ir_message_parser.get_msg_logtype_entry(), - parsed_msg.get_vars(), - parsed_msg.get_orig_num_bytes() + } + if (0 != error_code.value()) { + SPDLOG_ERROR( + "Failed to compress {} - {}:{}", path, + error_code.category().name(), + error_code.message() ); + return false; } - close_file_and_append_to_segment(archive_writer); - return true; } catch (TraceableException& e) { + auto error_code = e.get_error_code(); + if (ErrorCode_errno == error_code) { + SPDLOG_ERROR( + "Failed to compress {} - {}:{} {}, errno={}", + path, + e.get_filename(), + e.get_line_number(), + e.what(), + errno + ); + } else { + SPDLOG_ERROR( + "Failed to compress {} - {}:{} {}, error_code={}", + path, + e.get_filename(), + e.get_line_number(), + e.what(), + error_code + ); + } return false; } + + return true; } } diff --git a/components/core/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp index 8b5fc83fd..d97d20211 100644 --- a/components/core/src/clp/FileCompressor.hpp +++ b/components/core/src/clp/FileCompressor.hpp @@ -1,14 +1,19 @@ #ifndef CLP_FILECOMPRESSOR_HPP #define CLP_FILECOMPRESSOR_HPP +// C++ standard libraries +#include + +// Boost libraries #include +// Project headers #include "../BufferedFileReader.hpp" #include "../compressor_frontend/LogParser.hpp" +#include "../ir/LogEventDeserializer.hpp" #include "../LibarchiveFileReader.hpp" #include
"../LibarchiveReader.hpp" #include "../MessageParser.hpp" -#include "../ParsedIrMessage.hpp" #include "../ParsedMessage.hpp" #include "../streaming_archive/writer/Archive.hpp" #include "FileToCompress.hpp" @@ -73,22 +78,49 @@ namespace clp { streaming_archive::writer::Archive& archive_writer, bool use_heuristic); /** - * Parses and encodes IR from the given reader into the given archive_writer + * Compresses the IR stream from the given reader into the archive * @param target_data_size_of_dicts * @param archive_user_config * @param target_encoded_file_size - * @param path_for_compression + * @param path * @param group_id * @param archive_writer * @param reader + * @return Whether the IR stream was compressed successfully + */ + bool compress_ir_stream( + size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + std::string const& path, + group_id_t group_id, + streaming_archive::writer::Archive& archive_writer, + ReaderInterface& reader + ); + + /** + * Compresses an IR stream using the eight-byte or four-byte encoding + * based on the given template parameter. 
+ * @tparam encoded_variable_t + * @param target_data_size_of_dicts + * @param archive_user_config + * @param target_encoded_file_size + * @param path + * @param group_id + * @param archive + * @param log_event_deserializer + * @return An error code */ - bool try_compressing_as_ir (size_t target_data_size_of_dicts, - streaming_archive::writer::Archive::UserConfig& archive_user_config, - size_t target_encoded_file_size, - const std::string& path_for_compression, - group_id_t group_id, - streaming_archive::writer::Archive& archive_writer, - ReaderInterface& reader); + template + std::error_code compress_ir_stream_by_encoding( + size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + std::string const& path, + group_id_t group_id, + streaming_archive::writer::Archive& archive, + ir::LogEventDeserializer& log_event_deserializer + ); // Variables boost::uuids::random_generator& m_uuid_generator; @@ -101,4 +133,6 @@ namespace clp { }; } +#include "FileCompressor.tpp" + #endif // CLP_FILECOMPRESSOR_HPP diff --git a/components/core/src/clp/FileCompressor.tpp b/components/core/src/clp/FileCompressor.tpp new file mode 100644 index 000000000..c189b9e20 --- /dev/null +++ b/components/core/src/clp/FileCompressor.tpp @@ -0,0 +1,56 @@ +#ifndef CLP_FILECOMPRESSOR_TPP +#define CLP_FILECOMPRESSOR_TPP + +#include "FileCompressor.hpp" +#include "utils.hpp" + +namespace clp { +template +auto FileCompressor::compress_ir_stream_by_encoding( + size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + std::string const& path, + group_id_t group_id, + streaming_archive::writer::Archive& archive, + ir::LogEventDeserializer& log_event_deserializer +) -> std::error_code { + archive.create_and_open_file(path, group_id, m_uuid_generator(), 0); + + // We assume an IR stream only has one timestamp pattern + auto timestamp_pattern 
= log_event_deserializer.get_timestamp_pattern(); + archive.change_ts_pattern(&timestamp_pattern); + + std::error_code error_code{}; + while (true) { + auto result = log_event_deserializer.deserialize_log_event(); + if (result.has_error()) { + auto error = result.error(); + if (std::errc::no_message_available != error) { + error_code = error; + } + break; + } + + // Split archive/encoded file if necessary before writing the new event + if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + split_file_and_archive( + archive_user_config, + path, + group_id, + &timestamp_pattern, + archive + ); + } else if (archive.get_file().get_encoded_size_in_bytes() >= target_encoded_file_size) { + split_file(path, group_id, &timestamp_pattern, archive); + } + + archive.write_log_event_ir(result.value()); + } + + close_file_and_append_to_segment(archive); + return error_code; +} +} // namespace clp + +#endif diff --git a/components/core/src/ffi/encoding_methods.cpp b/components/core/src/ffi/encoding_methods.cpp index 9b2b3441f..9a7b17d36 100644 --- a/components/core/src/ffi/encoding_methods.cpp +++ b/components/core/src/ffi/encoding_methods.cpp @@ -112,4 +112,28 @@ bool get_bounds_of_next_var( return (msg_length != begin_pos); } + +eight_byte_encoded_variable_t encode_four_byte_float_as_eight_byte( + four_byte_encoded_variable_t four_byte_encoded_var +) { + uint8_t decimal_point_pos{}; + uint8_t num_digits{}; + uint32_t digits{}; + bool is_negative{}; + decode_float_properties( + four_byte_encoded_var, is_negative, digits, num_digits, decimal_point_pos); + + return encode_float_properties( + is_negative, + digits, + num_digits, + decimal_point_pos + ); +} + +eight_byte_encoded_variable_t encode_four_byte_integer_as_eight_byte( + four_byte_encoded_variable_t four_byte_encoded_var +) { + return static_cast(four_byte_encoded_var); +} } // namespace ffi diff --git a/components/core/src/ffi/encoding_methods.hpp b/components/core/src/ffi/encoding_methods.hpp index
ee24ca73d..52d05554f 100644 --- a/components/core/src/ffi/encoding_methods.hpp +++ b/components/core/src/ffi/encoding_methods.hpp @@ -132,6 +132,15 @@ bool get_bounds_of_next_var( template bool encode_float_string(std::string_view str, encoded_variable_t& encoded_var); +/** + * Encodes the given four-byte encoded float using the eight-byte encoding + * @param four_byte_encoded_var + * @return The float using the eight-byte encoding + */ +eight_byte_encoded_variable_t encode_four_byte_float_as_eight_byte( + four_byte_encoded_variable_t four_byte_encoded_var +); + /** * Encodes a float value with the given properties into an encoded variable * @tparam encoded_variable_t Type of the encoded variable @@ -154,6 +163,29 @@ encoded_variable_t encode_float_properties( size_t decimal_point_pos ); +/** + * Decodes an encoded float variable into its properties + * @tparam encoded_variable_t Type of the encoded variable + * @param encoded_var + * @param is_negative Returns whether the float is negative + * @param digits Returns the digits of the float, ignoring the decimal, as an + * integer + * @param num_digits Returns the number of digits in \p digits + * @param decimal_point_pos Returns the position of the decimal point from the + * right of the value + */ +template +void decode_float_properties( + encoded_variable_t encoded_var, + bool& is_negative, + std::conditional_t< + std::is_same_v, + uint32_t, + uint64_t>& digits, + uint8_t& num_digits, + uint8_t& decimal_point_pos +); + /** * Decodes the given encoded float variable into a string * @tparam encoded_variable_t Type of the encoded variable @@ -172,6 +204,16 @@ std::string decode_float_var(encoded_variable_t encoded_var); */ template bool encode_integer_string(std::string_view str, encoded_variable_t& encoded_var); + +/** + * Encodes the given four-byte encoded integer using the eight-byte encoding + * @param four_byte_encoded_var + * @return The integer using the eight-byte encoding + */ 
+eight_byte_encoded_variable_t encode_four_byte_integer_as_eight_byte( + four_byte_encoded_variable_t four_byte_encoded_var +); + /** * Decodes the given encoded integer variable into a string * @tparam encoded_variable_t Type of the encoded variable diff --git a/components/core/src/ffi/encoding_methods.tpp b/components/core/src/ffi/encoding_methods.tpp index c0053adc1..eacd93de4 100644 --- a/components/core/src/ffi/encoding_methods.tpp +++ b/components/core/src/ffi/encoding_methods.tpp @@ -165,17 +165,16 @@ encoded_variable_t encode_float_properties( } template -std::string decode_float_var(encoded_variable_t encoded_var) { - std::string value; - - uint8_t decimal_point_pos; - uint8_t num_digits; - std::conditional_t< - std::is_same_v, - uint32_t, - uint64_t> - digits; - bool is_negative; +void decode_float_properties( + encoded_variable_t encoded_var, + bool& is_negative, + std::conditional_t< + std::is_same_v, + uint32_t, + uint64_t>& digits, + uint8_t& num_digits, + uint8_t& decimal_point_pos +) { static_assert( (std::is_same_v || std::is_same_v) @@ -216,6 +215,21 @@ std::string decode_float_var(encoded_variable_t encoded_var) { encoded_float >>= 25; is_negative = encoded_float > 0; } +} + +template +std::string decode_float_var(encoded_variable_t encoded_var) { + std::string value; + + uint8_t decimal_point_pos; + uint8_t num_digits; + std::conditional_t< + std::is_same_v, + uint32_t, + uint64_t> + digits; + bool is_negative; + decode_float_properties(encoded_var, is_negative, digits, num_digits, decimal_point_pos); if (num_digits < decimal_point_pos) { throw EncodingException( diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 790a5ca6f..571abca8c 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -81,11 +81,9 @@ parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_m * 
encoded_variable_t == four_byte_encoded_variable_t or the actual timestamp if * encoded_variable_t == eight_byte_encoded_variable_t * @return IRErrorCode_Success on success - * @return IRErrorCode_Corrupted_IR if reader contains invalid IR * @return IRErrorCode_Decode_Error if the encoded message cannot be properly * decoded - * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data to - * decode + * @return Same as ffi::ir_stream::generic_parse_tokens */ template static IRErrorCode @@ -275,23 +273,19 @@ generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time return error_code; } - auto constant_handler = [&message](string const& value, size_t begin_pos, size_t length) { + auto constant_handler = [&](string const& value, size_t begin_pos, size_t length) { message.append(value, begin_pos, length); }; - auto constant_remainder_handler = [&message](string const& value, size_t begin_pos) { - message.append(value, begin_pos); - }; - - auto encoded_int_handler = [&message](encoded_variable_t value) { + auto encoded_int_handler = [&](encoded_variable_t value) { message.append(decode_integer_var(value)); }; - auto encoded_float_handler = [&message](encoded_variable_t encoded_float) { + auto encoded_float_handler = [&](encoded_variable_t encoded_float) { message.append(decode_float_var(encoded_float)); }; - auto dict_var_handler = [&message](string const& dict_var) { message.append(dict_var); }; + auto dict_var_handler = [&](string const& dict_var) { message.append(dict_var); }; try { generic_decode_message( @@ -299,7 +293,6 @@ generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time encoded_vars, dict_vars, constant_handler, - constant_remainder_handler, encoded_int_handler, encoded_float_handler, dict_var_handler diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 37ab8c93f..534992d7f 100644 --- 
a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -49,6 +49,7 @@ class DecodingException : public TraceableException { IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding); /** + * TODO Rename * Parse logtypes, dictionary variables and encoded variables * from the next encoded IR message. Returns the parsed tokens by * reference @@ -61,6 +62,7 @@ IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encod * @return IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data + * @return IRErrorCode_Eof on reaching the end of the stream */ template auto generic_parse_tokens( @@ -77,8 +79,6 @@ auto generic_parse_tokens( * @tparam encoded_variable_t Type of the encoded variable * @tparam ConstantHandler Method to handle constants in the logtypes. * Signature: (const std::string&, size_t, size_t) -> void - * @tparam ConstantRemainderHandler Method to handle the last constant in the - * logtypes. Signature: (const std::string&, size_t) -> void * @tparam EncodedIntHandler Method to handle encoded integers. * Signature: (encoded_variable_t) -> void * @tparam EncodedFloatHandler Method to handle encoded float. 
@@ -89,7 +89,6 @@ auto generic_parse_tokens( * @param encoded_vars * @param dict_vars * @param constant_handler - * @param constant_remainder_handler * @param encoded_int_handler * @param encoded_float_handler * @param dict_var_handler @@ -98,7 +97,6 @@ auto generic_parse_tokens( template < typename encoded_variable_t, typename ConstantHandler, - typename ConstantRemainderHandler, typename EncodedIntHandler, typename EncodedFloatHandler, typename DictVarHandler> @@ -107,7 +105,6 @@ void generic_decode_message( std::vector const& encoded_vars, std::vector const& dict_vars, ConstantHandler constant_handler, - ConstantRemainderHandler constant_remainder_handler, EncodedIntHandler encoded_int_handler, EncodedFloatHandler encoded_float_handler, DictVarHandler dict_var_handler diff --git a/components/core/src/ffi/ir_stream/decoding_methods.tpp b/components/core/src/ffi/ir_stream/decoding_methods.tpp index f40a3c6c6..c02a933dc 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.tpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.tpp @@ -1,10 +1,9 @@ #ifndef FFI_IR_STREAM_DECODING_METHODS_TPP #define FFI_IR_STREAM_DECODING_METHODS_TPP -#include +#include #include -#include "../../ReaderInterface.hpp" #include "../encoding_methods.hpp" #include "decoding_methods.hpp" #include "protocol_constants.hpp" @@ -13,7 +12,6 @@ namespace ffi::ir_stream { template < typename encoded_variable_t, typename ConstantHandler, - typename ConstantRemainderHandler, typename EncodedIntHandler, typename EncodedFloatHandler, typename DictVarHandler> @@ -22,14 +20,13 @@ void generic_decode_message( std::vector const& encoded_vars, std::vector const& dict_vars, ConstantHandler constant_handler, - ConstantRemainderHandler constant_remainder_handler, EncodedIntHandler encoded_int_handler, EncodedFloatHandler encoded_float_handler, DictVarHandler dict_var_handler ) { - size_t const logtype_length = logtype.length(); - size_t const encoded_vars_length = encoded_vars.size(); - 
size_t const dict_vars_length = dict_vars.size(); + auto const logtype_length = logtype.length(); + auto const encoded_vars_length = encoded_vars.size(); + auto const dict_vars_length = dict_vars.size(); size_t next_static_text_begin_pos = 0; size_t dictionary_vars_ix = 0; @@ -132,7 +129,11 @@ void generic_decode_message( } // Add remainder if (next_static_text_begin_pos < logtype_length) { - constant_remainder_handler(logtype, next_static_text_begin_pos); + constant_handler( + logtype, + next_static_text_begin_pos, + logtype_length - next_static_text_begin_pos + ); } } } // namespace ffi::ir_stream diff --git a/components/core/src/ir/LogEvent.hpp b/components/core/src/ir/LogEvent.hpp new file mode 100644 index 000000000..c84eba2f8 --- /dev/null +++ b/components/core/src/ir/LogEvent.hpp @@ -0,0 +1,52 @@ +#ifndef IR_LOGEVENT_HPP +#define IR_LOGEVENT_HPP + +#include +#include + +#include "../Defs.h" +#include "../ffi/encoding_methods.hpp" + +namespace ir { +/** + * A class representing a log event encoded using CLP's IR + * @tparam encoded_variable_t The type of encoded variables in the event + */ +template +class LogEvent { +public: + // Constructors + LogEvent( + ffi::epoch_time_ms_t timestamp, + std::string logtype, + std::vector dict_vars, + std::vector encoded_vars + ) + : m_timestamp{timestamp}, + m_logtype{std::move(logtype)}, + m_dict_vars{std::move(dict_vars)}, + m_encoded_vars{std::move(encoded_vars)} {} + + // Methods + [[nodiscard]] auto get_timestamp() const -> ffi::epoch_time_ms_t { return m_timestamp; } + + [[nodiscard]] auto get_logtype() const -> std::string const& { return m_logtype; } + + [[nodiscard]] auto get_dict_vars() const -> std::vector const& { + return m_dict_vars; + } + + [[nodiscard]] auto get_encoded_vars() const -> std::vector const& { + return m_encoded_vars; + } + +private: + // Variables + ffi::epoch_time_ms_t m_timestamp; + std::string m_logtype; + std::vector m_dict_vars; + std::vector m_encoded_vars; +}; +} // namespace ir + 
+#endif // IR_LOGEVENT_HPP diff --git a/components/core/src/ir/LogEventDeserializer.hpp b/components/core/src/ir/LogEventDeserializer.hpp new file mode 100644 index 000000000..dd1e89759 --- /dev/null +++ b/components/core/src/ir/LogEventDeserializer.hpp @@ -0,0 +1,91 @@ +#ifndef IR_LOGEVENTDESERIALIZER_HPP +#define IR_LOGEVENTDESERIALIZER_HPP + +#include + +#include + +#include "../ffi/encoding_methods.hpp" +#include "../ReaderInterface.hpp" +#include "../TimestampPattern.hpp" +#include "../TraceableException.hpp" +#include "LogEvent.hpp" + +namespace ir { +/** + * Class for deserializing IR log events from an IR stream. + * + * TODO: We're currently returning std::errc error codes, but we should replace + * these with our own custom error codes (derived from std::error_code), ideally + * replacing IRErrorCode. + * @tparam encoded_variable_t Type of encoded variables in the stream + */ +template +class LogEventDeserializer { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + + // Methods + [[nodiscard]] auto what() const noexcept -> char const* override { + return "ir::LogEventParser operation failed"; + } + }; + + // Factory functions + /** + * Creates a log event deserializer for the given stream + * @param reader A reader for the IR stream + * @return A result containing the serializer or an error code indicating + * the failure + */ + static auto create(ReaderInterface& reader) + -> BOOST_OUTCOME_V2_NAMESPACE::std_result>; + + // Delete copy constructor and assignment + LogEventDeserializer(LogEventDeserializer const&) = delete; + auto operator=(LogEventDeserializer const&) -> LogEventDeserializer& = delete; + + // Define default move constructor and assignment + LogEventDeserializer(LogEventDeserializer&&) = default; + auto operator=(LogEventDeserializer&&) -> 
LogEventDeserializer& = default; + + // Methods + [[nodiscard]] auto get_timestamp_pattern() const -> TimestampPattern const& { + return m_timestamp_pattern; + } + + /** + * Deserializes a log event from the stream + * @return A result containing the log event or an error code indicating + * the failure + */ + [[nodiscard]] auto deserialize_log_event() + -> BOOST_OUTCOME_V2_NAMESPACE::std_result>; + +private: + // Constructors + explicit LogEventDeserializer(ReaderInterface& reader) : m_reader{reader} {} + + LogEventDeserializer(ReaderInterface& reader, ffi::epoch_time_ms_t ref_timestamp) + : m_reader{reader}, + m_prev_msg_timestamp{ref_timestamp} {} + + // Variables + TimestampPattern m_timestamp_pattern{0, "%Y-%m-%dT%H:%M:%S.%3"}; + [[no_unique_address]] std::conditional_t< + std::is_same_v, + ffi::epoch_time_ms_t, + EmptyType> + m_prev_msg_timestamp{}; + ReaderInterface& m_reader; +}; +} // namespace ir + +#include "LogEventDeserializer.tpp" + +#endif // IR_LOGEVENTDESERIALIZER_HPP diff --git a/components/core/src/ir/LogEventDeserializer.tpp b/components/core/src/ir/LogEventDeserializer.tpp new file mode 100644 index 000000000..e71d521d8 --- /dev/null +++ b/components/core/src/ir/LogEventDeserializer.tpp @@ -0,0 +1,110 @@ +#ifndef IR_LOGEVENTDESERIALIZER_TPP +#define IR_LOGEVENTDESERIALIZER_TPP + +#include + +#include + +#include "../ffi/ir_stream/decoding_methods.hpp" +#include "LogEventDeserializer.hpp" + +namespace ir { +template +auto LogEventDeserializer::create(ReaderInterface& reader) + -> BOOST_OUTCOME_V2_NAMESPACE::std_result> { + ffi::ir_stream::encoded_tag_t metadata_type{0}; + std::vector metadata; + auto ir_error_code = ffi::ir_stream::decode_preamble(reader, metadata_type, metadata); + if (ffi::ir_stream::IRErrorCode_Success != ir_error_code) { + switch (ir_error_code) { + case ffi::ir_stream::IRErrorCode_Incomplete_IR: + return std::errc::result_out_of_range; + case ffi::ir_stream::IRErrorCode_Corrupted_IR: + default: + return 
std::errc::protocol_error; + } + } + + if (ffi::ir_stream::cProtocol::Metadata::EncodingJson != metadata_type) { + return std::errc::protocol_not_supported; + } + + // Parse metadata and validate version + auto metadata_json = nlohmann::json::parse(metadata, nullptr, false); + if (metadata_json.is_discarded()) { + return std::errc::protocol_error; + } + auto version_iter = metadata_json.find(ffi::ir_stream::cProtocol::Metadata::VersionKey); + if (metadata_json.end() == version_iter || false == version_iter->is_string()) { + return std::errc::protocol_error; + } + auto metadata_version = version_iter->get_ref(); + if (static_cast(ffi::ir_stream::cProtocol::Metadata::VersionValue) + != metadata_version) + { + return std::errc::protocol_not_supported; + } + + if constexpr (std::is_same_v) { + return LogEventDeserializer{reader}; + } else if constexpr (std::is_same_v) { + // Get reference timestamp + auto ref_timestamp_iter + = metadata_json.find(ffi::ir_stream::cProtocol::Metadata::ReferenceTimestampKey); + if (metadata_json.end() == ref_timestamp_iter || false == ref_timestamp_iter->is_string()) { + return std::errc::protocol_error; + } + auto ref_timestamp_str = ref_timestamp_iter->get_ref(); + ffi::epoch_time_ms_t ref_timestamp{}; + if (false == convert_string_to_int(ref_timestamp_str, ref_timestamp)) { + return std::errc::protocol_error; + } + + return LogEventDeserializer{reader, ref_timestamp}; + } else { + static_assert(cAlwaysFalse); + } +} + +template +auto LogEventDeserializer::deserialize_log_event() + -> BOOST_OUTCOME_V2_NAMESPACE::std_result> { + ffi::epoch_time_ms_t timestamp_or_timestamp_delta{}; + std::string logtype; + std::vector dict_vars; + std::vector encoded_vars; + + auto ir_error_code = ffi::ir_stream::generic_parse_tokens( + m_reader, + logtype, + encoded_vars, + dict_vars, + timestamp_or_timestamp_delta + ); + if (ffi::ir_stream::IRErrorCode_Success != ir_error_code) { + switch (ir_error_code) { + case ffi::ir_stream::IRErrorCode_Eof: + 
return std::errc::no_message_available; + case ffi::ir_stream::IRErrorCode_Incomplete_IR: + return std::errc::result_out_of_range; + case ffi::ir_stream::IRErrorCode_Corrupted_IR: + default: + return std::errc::protocol_error; + } + } + + ffi::epoch_time_ms_t timestamp{}; + if constexpr (std::is_same_v) { + timestamp = timestamp_or_timestamp_delta; + } else if constexpr (std::is_same_v) { + m_prev_msg_timestamp += timestamp_or_timestamp_delta; + timestamp = m_prev_msg_timestamp; + } else { + static_assert(cAlwaysFalse); + } + + return LogEvent{timestamp, logtype, dict_vars, encoded_vars}; +} +} // namespace ir + +#endif // IR_LOGEVENTDESERIALIZER_TPP diff --git a/components/core/src/ir/utils.cpp b/components/core/src/ir/utils.cpp new file mode 100644 index 000000000..9f55b6678 --- /dev/null +++ b/components/core/src/ir/utils.cpp @@ -0,0 +1,13 @@ +#include "utils.hpp" + +#include "../BufferReader.hpp" +#include "../ffi/ir_stream/decoding_methods.hpp" + +namespace ir { +auto has_ir_stream_magic_number(std::string_view buf) -> bool { + BufferReader buf_reader{buf.data(), buf.size()}; + bool is_four_bytes_encoded{false}; + return ffi::ir_stream::IRErrorCode_Success + == ffi::ir_stream::get_encoding_type(buf_reader, is_four_bytes_encoded); +} +} // namespace ir diff --git a/components/core/src/ir/utils.hpp b/components/core/src/ir/utils.hpp new file mode 100644 index 000000000..8e48e44f4 --- /dev/null +++ b/components/core/src/ir/utils.hpp @@ -0,0 +1,15 @@ +#ifndef IR_UTILS_HPP +#define IR_UTILS_HPP + +#include + +namespace ir { +/** + * @param buf + * @return Whether the content in the buffer starts with one of the IR stream + * magic numbers + */ +auto has_ir_stream_magic_number(std::string_view buf) -> bool; +} // namespace ir + +#endif // IR_UTILS_HPP diff --git a/components/core/src/streaming_archive/writer/Archive.cpp b/components/core/src/streaming_archive/writer/Archive.cpp index fe30ddee8..ba09d4b29 100644 --- 
a/components/core/src/streaming_archive/writer/Archive.cpp +++ b/components/core/src/streaming_archive/writer/Archive.cpp @@ -269,50 +269,6 @@ namespace streaming_archive::writer { } } - void Archive::write_ir_message( - epochtime_t timestamp, - LogTypeDictionaryEntry& logtype_entry, - std::vector const& variables, - size_t num_uncompressed_bytes - ) { - // Encode logtype - logtype_dictionary_id_t logtype_id; - m_logtype_dict.add_entry(logtype_entry, logtype_id); - - vector encoded_vars; - vector var_ids; - // Encode variable base on type - for (auto const& var : variables) { - if (var.type() == ParsedIrMessage::VariableType::EncodedVar) { - encoded_vars.push_back(var.get_encoded_var()); - } else if (var.type() == ParsedIrMessage::VariableType::DictVar) { - variable_dictionary_id_t id; - m_var_dict.add_entry(var.get_dict_var(), id); - encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); - var_ids.push_back(id); - } else { - throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); - } - } - - m_file->write_encoded_msg( - timestamp, - logtype_id, - encoded_vars, - var_ids, - num_uncompressed_bytes - ); - - // Update segment indices - if (m_file->has_ts_pattern()) { - m_logtype_ids_in_segment_for_files_with_timestamps.insert(logtype_id); - m_var_ids_in_segment_for_files_with_timestamps.insert_all(var_ids); - } else { - m_logtype_ids_for_file_with_unassigned_segment.insert(logtype_id); - m_var_ids_for_file_with_unassigned_segment.insert(var_ids.cbegin(), var_ids.cend()); - } - } - void Archive::write_msg_using_schema (compressor_frontend::Token*& uncompressed_msg, uint32_t uncompressed_msg_pos, const bool has_delimiter, const bool has_timestamp) { epochtime_t timestamp = 0; diff --git a/components/core/src/streaming_archive/writer/Archive.hpp b/components/core/src/streaming_archive/writer/Archive.hpp index d71dc6fcf..866789ade 100644 --- a/components/core/src/streaming_archive/writer/Archive.hpp +++ 
b/components/core/src/streaming_archive/writer/Archive.hpp @@ -19,8 +19,8 @@ #include "../../compressor_frontend/Token.hpp" #include "../../ErrorCode.hpp" #include "../../GlobalMetadataDB.hpp" +#include "../../ir/LogEvent.hpp" #include "../../LogTypeDictionaryWriter.hpp" -#include "../../ParsedIrMessage.hpp" #include "../../VariableDictionaryWriter.hpp" #include "../ArchiveMetadata.hpp" #include "../MetadataDB.hpp" @@ -131,19 +131,13 @@ namespace streaming_archive { namespace writer { */ void write_msg (epochtime_t timestamp, const std::string& message, size_t num_uncompressed_bytes); /** - * Encodes and writes a message to the current encoded file - * @param timestamp - * @param logtype_entry - * @param variables - * @param num_uncompressed_bytes - * @throw FileWriter::OperationFailed if any write fails + * Writes an IR log event to the current encoded file + * @tparam encoded_variable_t The type of the encoded variables in the + * log event + * @param log_event */ - void write_ir_message( - epochtime_t timestamp, - LogTypeDictionaryEntry& logtype_entry, - std::vector const& variables, - size_t num_uncompressed_bytes - ); + template + void write_log_event_ir(ir::LogEvent const& log_event); /** * Encodes and writes a message to the given file using schema file * @param file @@ -313,4 +307,6 @@ namespace streaming_archive { namespace writer { }; } } +#include "Archive.tpp" + #endif // STREAMING_ARCHIVE_WRITER_ARCHIVE_HPP diff --git a/components/core/src/streaming_archive/writer/Archive.tpp b/components/core/src/streaming_archive/writer/Archive.tpp new file mode 100644 index 000000000..9c2c47dcf --- /dev/null +++ b/components/core/src/streaming_archive/writer/Archive.tpp @@ -0,0 +1,47 @@ +#ifndef STREAMING_ARCHIVE_WRITER_ARCHIVE_TPP +#define STREAMING_ARCHIVE_WRITER_ARCHIVE_TPP + +#include + +#include "../../EncodedVariableInterpreter.hpp" +#include "../../ir/LogEvent.hpp" + +namespace streaming_archive::writer { +template +void 
Archive::write_log_event_ir(ir::LogEvent const& log_event) { + std::vector encoded_vars; + std::vector var_ids; + size_t original_num_bytes{0}; + EncodedVariableInterpreter::encode_and_add_to_dictionary( + log_event, + m_logtype_dict_entry, + m_var_dict, + encoded_vars, + var_ids, + original_num_bytes + ); + + logtype_dictionary_id_t logtype_id{cLogtypeDictionaryIdMax}; + m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id); + + m_file->write_encoded_msg( + log_event.get_timestamp(), + logtype_id, + encoded_vars, + var_ids, + original_num_bytes + ); + + // TODO deduplicate + // Update segment indices + if (m_file->has_ts_pattern()) { + m_logtype_ids_in_segment_for_files_with_timestamps.insert(logtype_id); + m_var_ids_in_segment_for_files_with_timestamps.insert_all(var_ids); + } else { + m_logtype_ids_for_file_with_unassigned_segment.insert(logtype_id); + m_var_ids_for_file_with_unassigned_segment.insert(var_ids.cbegin(), var_ids.cend()); + } +} +} // namespace streaming_archive::writer + +#endif // STREAMING_ARCHIVE_WRITER_ARCHIVE_TPP diff --git a/components/core/src/type_utils.hpp b/components/core/src/type_utils.hpp index 53f6e5742..8c68d355c 100644 --- a/components/core/src/type_utils.hpp +++ b/components/core/src/type_utils.hpp @@ -5,6 +5,19 @@ #include #include +/** + * A template-parameterized false-constant which can be used to make static + * assertions conditional based on deduced template parameters + */ +template +[[maybe_unused]] constexpr bool cAlwaysFalse{false}; + +/** + * An empty type which can be used to declare variables conditionally based on + * template parameters + */ +struct EmptyType {}; + /** * Gets the underlying type of the given enum * @tparam T @@ -66,5 +79,4 @@ std::enable_if_t return reinterpret_cast(src); } - #endif // TYPE_UTILS_HPP From f8f587a222323bd775fa60d37fd69514b732451f Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Aug 2023 07:58:04 -0400 Subject: [PATCH 
116/121] Address todos: Rename generic_parse_tokens -> deserialize_ir_message and deduplicate update_segment_indices. --- .../src/ffi/ir_stream/decoding_methods.cpp | 19 ++++++------ .../src/ffi/ir_stream/decoding_methods.hpp | 26 +++++++--------- .../core/src/ir/LogEventDeserializer.tpp | 2 +- .../src/streaming_archive/writer/Archive.cpp | 31 +++++++++---------- .../src/streaming_archive/writer/Archive.hpp | 5 +++ .../src/streaming_archive/writer/Archive.tpp | 10 +----- 6 files changed, 44 insertions(+), 49 deletions(-) diff --git a/components/core/src/ffi/ir_stream/decoding_methods.cpp b/components/core/src/ffi/ir_stream/decoding_methods.cpp index 571abca8c..167fc77cd 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.cpp @@ -83,7 +83,7 @@ parse_timestamp(ReaderInterface& reader, encoded_tag_t encoded_tag, epoch_time_m * @return IRErrorCode_Success on success * @return IRErrorCode_Decode_Error if the encoded message cannot be properly * decoded - * @return Same as ffi::ir_stream::generic_parse_tokens + * @return Same as ffi::ir_stream::deserialize_ir_message */ template static IRErrorCode @@ -267,7 +267,7 @@ generic_decode_next_message(ReaderInterface& reader, string& message, epoch_time vector dict_vars; string logtype; if (auto error_code - = generic_parse_tokens(reader, logtype, encoded_vars, dict_vars, timestamp); + = deserialize_ir_message(reader, logtype, encoded_vars, dict_vars, timestamp); IRErrorCode_Success != error_code) { return error_code; @@ -336,12 +336,12 @@ read_metadata_info(ReaderInterface& reader, encoded_tag_t& metadata_type, uint16 } template -auto generic_parse_tokens( +auto deserialize_ir_message( ReaderInterface& reader, string& logtype, vector& encoded_vars, vector& dict_vars, - epoch_time_ms_t& timestamp + epoch_time_ms_t& timestamp_or_timestamp_delta ) -> IRErrorCode { encoded_tag_t encoded_tag{cProtocol::Eof}; if (ErrorCode_Success != 
reader.try_read_numeric_value(encoded_tag)) { @@ -386,7 +386,8 @@ auto generic_parse_tokens( if (ErrorCode_Success != reader.try_read_numeric_value(encoded_tag)) { return IRErrorCode_Incomplete_IR; } - if (auto error_code = parse_timestamp(reader, encoded_tag, timestamp); + if (auto error_code + = parse_timestamp(reader, encoded_tag, timestamp_or_timestamp_delta); IRErrorCode_Success != error_code) { return error_code; @@ -483,19 +484,19 @@ namespace eight_byte_encoding { } // namespace eight_byte_encoding // Explicitly declare specializations -template auto generic_parse_tokens( +template auto deserialize_ir_message( ReaderInterface& reader, string& logtype, vector& encoded_vars, vector& dict_vars, - epoch_time_ms_t& timestamp + epoch_time_ms_t& timestamp_or_timestamp_delta ) -> IRErrorCode; -template auto generic_parse_tokens( +template auto deserialize_ir_message( ReaderInterface& reader, string& logtype, vector& encoded_vars, vector& dict_vars, - epoch_time_ms_t& timestamp + epoch_time_ms_t& timestamp_or_timestamp_delta ) -> IRErrorCode; } // namespace ffi::ir_stream diff --git a/components/core/src/ffi/ir_stream/decoding_methods.hpp b/components/core/src/ffi/ir_stream/decoding_methods.hpp index 534992d7f..cf4264111 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.hpp +++ b/components/core/src/ffi/ir_stream/decoding_methods.hpp @@ -49,39 +49,37 @@ class DecodingException : public TraceableException { IRErrorCode get_encoding_type(ReaderInterface& reader, bool& is_four_bytes_encoding); /** - * TODO Rename - * Parse logtypes, dictionary variables and encoded variables - * from the next encoded IR message. 
Returns the parsed tokens by - * reference + * Deserializes an IR message from the given stream * @tparam encoded_variable_t * @param reader - * @param logtype - * @param encoded_vars - * @param dict_vars - * @param timestamp + * @param logtype Returns the logtype + * @param encoded_vars Returns the encoded variables + * @param dict_vars Returns the dictionary variables + * @param timestamp_or_timestamp_delta Returns the timestamp (in the eight-byte + * encoding case) or the timestamp delta (in the four-byte encoding case) * @return IRErrorCode_Success on success * @return IRErrorCode_Corrupted_IR if reader contains invalid IR * @return IRErrorCode_Incomplete_IR if reader doesn't contain enough data * @return IRErrorCode_Eof on reaching the end of the stream */ template -auto generic_parse_tokens( +auto deserialize_ir_message( ReaderInterface& reader, std::string& logtype, std::vector& encoded_vars, std::vector& dict_vars, - epoch_time_ms_t& timestamp + epoch_time_ms_t& timestamp_or_timestamp_delta ) -> IRErrorCode; /** - * Decodes the message consists of the tokens and calls the given methods - * to handle specific components of the message. + * Decodes the IR message and calls the given methods to handle each component of + * the message * @tparam encoded_variable_t Type of the encoded variable - * @tparam ConstantHandler Method to handle constants in the logtypes. + * @tparam ConstantHandler Method to handle constants in the logtype. * Signature: (const std::string&, size_t, size_t) -> void * @tparam EncodedIntHandler Method to handle encoded integers. * Signature: (encoded_variable_t) -> void - * @tparam EncodedFloatHandler Method to handle encoded float. + * @tparam EncodedFloatHandler Method to handle encoded floats. * Signature: (encoded_variable_t) -> void * @tparam DictVarHandler Method to handle dictionary variables. 
* Signature: (const std::string&) -> void diff --git a/components/core/src/ir/LogEventDeserializer.tpp b/components/core/src/ir/LogEventDeserializer.tpp index e71d521d8..3892db72a 100644 --- a/components/core/src/ir/LogEventDeserializer.tpp +++ b/components/core/src/ir/LogEventDeserializer.tpp @@ -74,7 +74,7 @@ auto LogEventDeserializer::deserialize_log_event() std::vector dict_vars; std::vector encoded_vars; - auto ir_error_code = ffi::ir_stream::generic_parse_tokens( + auto ir_error_code = ffi::ir_stream::deserialize_ir_message( m_reader, logtype, encoded_vars, diff --git a/components/core/src/streaming_archive/writer/Archive.cpp b/components/core/src/streaming_archive/writer/Archive.cpp index ba09d4b29..4d5f4bd9f 100644 --- a/components/core/src/streaming_archive/writer/Archive.cpp +++ b/components/core/src/streaming_archive/writer/Archive.cpp @@ -259,14 +259,7 @@ namespace streaming_archive::writer { m_file->write_encoded_msg(timestamp, logtype_id, encoded_vars, var_ids, num_uncompressed_bytes); - // Update segment indices - if (m_file->has_ts_pattern()) { - m_logtype_ids_in_segment_for_files_with_timestamps.insert(logtype_id); - m_var_ids_in_segment_for_files_with_timestamps.insert_all(var_ids); - } else { - m_logtype_ids_for_file_with_unassigned_segment.insert(logtype_id); - m_var_ids_for_file_with_unassigned_segment.insert(var_ids.cbegin(), var_ids.cend()); - } + update_segment_indices(logtype_id, var_ids); } void Archive::write_msg_using_schema (compressor_frontend::Token*& uncompressed_msg, uint32_t uncompressed_msg_pos, const bool has_delimiter, @@ -370,14 +363,7 @@ namespace streaming_archive::writer { m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id); m_file->write_encoded_msg(timestamp, logtype_id, m_encoded_vars, m_var_ids, num_uncompressed_bytes); - // Update segment indices - if (m_file->has_ts_pattern()) { - m_logtype_ids_in_segment_for_files_with_timestamps.insert(logtype_id); - 
m_var_ids_in_segment_for_files_with_timestamps.insert_all(m_var_ids); - } else { - m_logtype_ids_for_file_with_unassigned_segment.insert(logtype_id); - m_var_ids_for_file_with_unassigned_segment.insert(m_var_ids.cbegin(), m_var_ids.cend()); - } + update_segment_indices(logtype_id, m_var_ids); } } @@ -387,6 +373,19 @@ namespace streaming_archive::writer { m_var_dict.write_header_and_flush_to_disk(); } + void Archive::update_segment_indices( + logtype_dictionary_id_t logtype_id, + vector const& var_ids + ) { + if (m_file->has_ts_pattern()) { + m_logtype_ids_in_segment_for_files_with_timestamps.insert(logtype_id); + m_var_ids_in_segment_for_files_with_timestamps.insert_all(var_ids); + } else { + m_logtype_ids_for_file_with_unassigned_segment.insert(logtype_id); + m_var_ids_for_file_with_unassigned_segment.insert(var_ids.cbegin(), var_ids.cend()); + } + } + void Archive::append_file_contents_to_segment (Segment& segment, ArrayBackedPosIntSet& logtype_ids_in_segment, ArrayBackedPosIntSet& var_ids_in_segment, vector& files_in_segment) { diff --git a/components/core/src/streaming_archive/writer/Archive.hpp b/components/core/src/streaming_archive/writer/Archive.hpp index 866789ade..4137b306d 100644 --- a/components/core/src/streaming_archive/writer/Archive.hpp +++ b/components/core/src/streaming_archive/writer/Archive.hpp @@ -209,6 +209,11 @@ namespace streaming_archive { namespace writer { }; // Methods + void update_segment_indices( + logtype_dictionary_id_t logtype_id, + std::vector const& var_ids + ); + /** * Appends the content of the current encoded file to the given segment * @param segment diff --git a/components/core/src/streaming_archive/writer/Archive.tpp b/components/core/src/streaming_archive/writer/Archive.tpp index 9c2c47dcf..afbaa0f96 100644 --- a/components/core/src/streaming_archive/writer/Archive.tpp +++ b/components/core/src/streaming_archive/writer/Archive.tpp @@ -32,15 +32,7 @@ void Archive::write_log_event_ir(ir::LogEvent const& log_eve 
original_num_bytes ); - // TODO deduplicate - // Update segment indices - if (m_file->has_ts_pattern()) { - m_logtype_ids_in_segment_for_files_with_timestamps.insert(logtype_id); - m_var_ids_in_segment_for_files_with_timestamps.insert_all(var_ids); - } else { - m_logtype_ids_for_file_with_unassigned_segment.insert(logtype_id); - m_var_ids_for_file_with_unassigned_segment.insert(var_ids.cbegin(), var_ids.cend()); - } + update_segment_indices(logtype_id, var_ids); } } // namespace streaming_archive::writer From 48d7b01103362749d7bd4b40658a2c75b06f31ed Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Aug 2023 08:27:50 -0400 Subject: [PATCH 117/121] Add boost-outcome as submodule; LogEventDeserializer: Explicitly specify error returns. --- .gitmodules | 3 +++ components/core/src/ir/LogEventDeserializer.hpp | 13 ++++++++++--- components/core/submodules/boost-outcome | 1 + .../tools/scripts/deps-download/boost-outcome.json | 10 ++++++++++ .../tools/scripts/deps-download/download-all.sh | 1 + 5 files changed, 25 insertions(+), 3 deletions(-) create mode 160000 components/core/submodules/boost-outcome create mode 100644 components/core/tools/scripts/deps-download/boost-outcome.json diff --git a/.gitmodules b/.gitmodules index d48454341..213771efd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -11,3 +11,6 @@ [submodule "components/core/submodules/yaml-cpp"] path = components/core/submodules/yaml-cpp url = https://github.com/jbeder/yaml-cpp.git +[submodule "components/core/submodules/boost-outcome"] + path = components/core/submodules/boost-outcome + url = git@github.com:boostorg/outcome.git diff --git a/components/core/src/ir/LogEventDeserializer.hpp b/components/core/src/ir/LogEventDeserializer.hpp index dd1e89759..ac071927a 100644 --- a/components/core/src/ir/LogEventDeserializer.hpp +++ b/components/core/src/ir/LogEventDeserializer.hpp @@ -3,7 +3,7 @@ #include -#include +#include #include 
"../ffi/encoding_methods.hpp" #include "../ReaderInterface.hpp" @@ -41,7 +41,11 @@ class LogEventDeserializer { * Creates a log event deserializer for the given stream * @param reader A reader for the IR stream * @return A result containing the serializer or an error code indicating - * the failure + * the failure: + * - std::errc::result_out_of_range if the IR stream is truncated + * - std::errc::protocol_error if the IR stream is corrupted + * - std::errc::protocol_not_supported if the IR stream contains an + * unsupported metadata format or uses an unsupported version */ static auto create(ReaderInterface& reader) -> BOOST_OUTCOME_V2_NAMESPACE::std_result>; @@ -62,7 +66,10 @@ class LogEventDeserializer { /** * Deserializes a log event from the stream * @return A result containing the log event or an error code indicating - * the failure + * the failure: + * - std::errc::no_message_available on reaching the end of the IR stream + * - std::errc::result_out_of_range if the IR stream is truncated + * - std::errc::protocol_error if the IR stream is corrupted */ [[nodiscard]] auto deserialize_log_event() -> BOOST_OUTCOME_V2_NAMESPACE::std_result>; diff --git a/components/core/submodules/boost-outcome b/components/core/submodules/boost-outcome new file mode 160000 index 000000000..39500a331 --- /dev/null +++ b/components/core/submodules/boost-outcome @@ -0,0 +1 @@ +Subproject commit 39500a33117c23596673c1925479c7ff01b602f6 diff --git a/components/core/tools/scripts/deps-download/boost-outcome.json b/components/core/tools/scripts/deps-download/boost-outcome.json new file mode 100644 index 000000000..01e89b394 --- /dev/null +++ b/components/core/tools/scripts/deps-download/boost-outcome.json @@ -0,0 +1,10 @@ +{ + "url": "https://github.com/boostorg/outcome/archive/refs/tags/boost-1.83.0.zip", + "unzip": true, + "targets": [ + { + "source": "outcome-boost-1.83.0", + "destination": "submodules/boost-outcome" + } + ] +} diff --git 
a/components/core/tools/scripts/deps-download/download-all.sh b/components/core/tools/scripts/deps-download/download-all.sh index 3a6688e5b..07c5a427b 100755 --- a/components/core/tools/scripts/deps-download/download-all.sh +++ b/components/core/tools/scripts/deps-download/download-all.sh @@ -21,6 +21,7 @@ python3 "${script_dir}/download-dep.py" "${script_dir}/antlr4.json" if [ -e "$project_root_dir/.git" ] ; then git submodule update --init --recursive else + python3 "${script_dir}/download-dep.py" "${script_dir}/boost-outcome.json" python3 "${script_dir}/download-dep.py" "${script_dir}/Catch2.json" python3 "${script_dir}/download-dep.py" "${script_dir}/date.json" python3 "${script_dir}/download-dep.py" "${script_dir}/json.json" From 1170333d080b41e6bbd9fd49df6c87ae5a5615d7 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Aug 2023 23:46:39 -0400 Subject: [PATCH 118/121] Clean-up LogEventDeserializer; Move template methods into cpp with explicit specializations. 
--- components/core/CMakeLists.txt | 4 ++- components/core/src/clp/FileCompressor.cpp | 1 + ...erializer.tpp => LogEventDeserializer.cpp} | 30 ++++++++++++------- .../core/src/ir/LogEventDeserializer.hpp | 17 ++--------- 4 files changed, 25 insertions(+), 27 deletions(-) rename components/core/src/ir/{LogEventDeserializer.tpp => LogEventDeserializer.cpp} (78%) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 7dc34e654..6c83f1fff 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -250,8 +250,8 @@ set(SOURCE_FILES_clp src/GlobalSQLiteMetadataDB.cpp src/GlobalSQLiteMetadataDB.hpp src/ir/LogEvent.hpp + src/ir/LogEventDeserializer.cpp src/ir/LogEventDeserializer.hpp - src/ir/LogEventDeserializer.tpp src/ir/utils.cpp src/ir/utils.hpp src/LibarchiveFileReader.cpp @@ -790,6 +790,8 @@ set(SOURCE_FILES_unitTest src/Grep.cpp src/Grep.hpp src/ir/LogEvent.hpp + src/ir/LogEventDeserializer.cpp + src/ir/LogEventDeserializer.hpp src/ir/utils.cpp src/ir/utils.hpp src/LibarchiveFileReader.cpp diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index c16e2eebd..2a99de2dd 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -12,6 +12,7 @@ #include // Project headers +#include "../ffi/ir_stream/decoding_methods.hpp" #include "../ir/utils.hpp" #include "../Profiler.hpp" #include "utils.hpp" diff --git a/components/core/src/ir/LogEventDeserializer.tpp b/components/core/src/ir/LogEventDeserializer.cpp similarity index 78% rename from components/core/src/ir/LogEventDeserializer.tpp rename to components/core/src/ir/LogEventDeserializer.cpp index 3892db72a..16ba71ba2 100644 --- a/components/core/src/ir/LogEventDeserializer.tpp +++ b/components/core/src/ir/LogEventDeserializer.cpp @@ -1,12 +1,10 @@ -#ifndef IR_LOGEVENTDESERIALIZER_TPP -#define IR_LOGEVENTDESERIALIZER_TPP +#include "LogEventDeserializer.hpp" #include #include 
#include "../ffi/ir_stream/decoding_methods.hpp" -#include "LogEventDeserializer.hpp" namespace ir { template @@ -47,7 +45,8 @@ auto LogEventDeserializer::create(ReaderInterface& reader) if constexpr (std::is_same_v) { return LogEventDeserializer{reader}; - } else if constexpr (std::is_same_v) { + } + if constexpr (std::is_same_v) { // Get reference timestamp auto ref_timestamp_iter = metadata_json.find(ffi::ir_stream::cProtocol::Metadata::ReferenceTimestampKey); @@ -61,8 +60,6 @@ auto LogEventDeserializer::create(ReaderInterface& reader) } return LogEventDeserializer{reader, ref_timestamp}; - } else { - static_assert(cAlwaysFalse); } } @@ -96,15 +93,26 @@ auto LogEventDeserializer::deserialize_log_event() ffi::epoch_time_ms_t timestamp{}; if constexpr (std::is_same_v) { timestamp = timestamp_or_timestamp_delta; - } else if constexpr (std::is_same_v) { + } else { // std::is_same_v m_prev_msg_timestamp += timestamp_or_timestamp_delta; timestamp = m_prev_msg_timestamp; - } else { - static_assert(cAlwaysFalse); } return LogEvent{timestamp, logtype, dict_vars, encoded_vars}; } -} // namespace ir -#endif // IR_LOGEVENTDESERIALIZER_TPP +// Explicitly declare template specializations so that we can define the +// template methods in this file +template auto +LogEventDeserializer::create(ReaderInterface& reader) + -> BOOST_OUTCOME_V2_NAMESPACE::std_result< + LogEventDeserializer>; +template auto +LogEventDeserializer::create(ReaderInterface& reader) + -> BOOST_OUTCOME_V2_NAMESPACE::std_result< + LogEventDeserializer>; +template auto LogEventDeserializer::deserialize_log_event() + -> BOOST_OUTCOME_V2_NAMESPACE::std_result>; +template auto LogEventDeserializer::deserialize_log_event() + -> BOOST_OUTCOME_V2_NAMESPACE::std_result>; +} // namespace ir diff --git a/components/core/src/ir/LogEventDeserializer.hpp b/components/core/src/ir/LogEventDeserializer.hpp index ac071927a..e392f0157 100644 --- a/components/core/src/ir/LogEventDeserializer.hpp +++ 
b/components/core/src/ir/LogEventDeserializer.hpp @@ -23,19 +23,6 @@ namespace ir { template class LogEventDeserializer { public: - // Types - class OperationFailed : public TraceableException { - public: - // Constructors - OperationFailed(ErrorCode error_code, char const* const filename, int line_number) - : TraceableException(error_code, filename, line_number) {} - - // Methods - [[nodiscard]] auto what() const noexcept -> char const* override { - return "ir::LogEventParser operation failed"; - } - }; - // Factory functions /** * Creates a log event deserializer for the given stream @@ -58,6 +45,8 @@ class LogEventDeserializer { LogEventDeserializer(LogEventDeserializer&&) = default; auto operator=(LogEventDeserializer&&) -> LogEventDeserializer& = default; + ~LogEventDeserializer() = default; + // Methods [[nodiscard]] auto get_timestamp_pattern() const -> TimestampPattern const& { return m_timestamp_pattern; @@ -93,6 +82,4 @@ class LogEventDeserializer { }; } // namespace ir -#include "LogEventDeserializer.tpp" - #endif // IR_LOGEVENTDESERIALIZER_HPP From 7ae1e867aa3fae37506008e4cfd87175e41367f3 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Aug 2023 23:59:42 -0400 Subject: [PATCH 119/121] FileCompressor: Move template implementation into cpp with explicit specialization. 
--- components/core/CMakeLists.txt | 2 - components/core/src/clp/FileCompressor.cpp | 70 ++++++++++++++++++++++ components/core/src/clp/FileCompressor.hpp | 2 - components/core/src/clp/FileCompressor.tpp | 56 ----------------- 4 files changed, 70 insertions(+), 60 deletions(-) delete mode 100644 components/core/src/clp/FileCompressor.tpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 6c83f1fff..8aeb49913 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -181,7 +181,6 @@ set(SOURCE_FILES_clp src/clp/decompression.hpp src/clp/FileCompressor.cpp src/clp/FileCompressor.hpp - src/clp/FileCompressor.tpp src/clp/FileDecompressor.cpp src/clp/FileDecompressor.hpp src/clp/FileToCompress.cpp @@ -704,7 +703,6 @@ set(SOURCE_FILES_unitTest src/clp/decompression.hpp src/clp/FileCompressor.cpp src/clp/FileCompressor.hpp - src/clp/FileCompressor.tpp src/clp/FileDecompressor.cpp src/clp/FileDecompressor.hpp src/clp/FileToCompress.cpp diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 2a99de2dd..e4e414b52 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -412,4 +412,74 @@ namespace clp { return true; } + + template + std::error_code FileCompressor::compress_ir_stream_by_encoding( + size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + string const& path, + group_id_t group_id, + streaming_archive::writer::Archive& archive, + LogEventDeserializer& log_event_deserializer + ) { + archive.create_and_open_file(path, group_id, m_uuid_generator(), 0); + + // We assume an IR stream only has one timestamp pattern + auto timestamp_pattern = log_event_deserializer.get_timestamp_pattern(); + archive.change_ts_pattern(&timestamp_pattern); + + std::error_code error_code{}; + while (true) { + auto result = 
log_event_deserializer.deserialize_log_event(); + if (result.has_error()) { + auto error = result.error(); + if (std::errc::no_message_available != error) { + error_code = error; + } + break; + } + + // Split archive/encoded file if necessary before writing the new event + if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { + split_file_and_archive( + archive_user_config, + path, + group_id, + &timestamp_pattern, + archive + ); + } else if (archive.get_file().get_encoded_size_in_bytes() >= target_encoded_file_size) { + split_file(path, group_id, &timestamp_pattern, archive); + } + + archive.write_log_event_ir(result.value()); + } + + close_file_and_append_to_segment(archive); + return error_code; + } + + // Explicitly declare template specializations so that we can define the + // template methods in this file + template std::error_code + FileCompressor::compress_ir_stream_by_encoding( + size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + string const& path, + group_id_t group_id, + streaming_archive::writer::Archive& archive, + LogEventDeserializer& log_event_deserializer + ); + template std::error_code + FileCompressor::compress_ir_stream_by_encoding( + size_t target_data_size_of_dicts, + streaming_archive::writer::Archive::UserConfig& archive_user_config, + size_t target_encoded_file_size, + string const& path, + group_id_t group_id, + streaming_archive::writer::Archive& archive, + LogEventDeserializer& log_event_deserializer + ); } diff --git a/components/core/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp index d97d20211..7d87e12db 100644 --- a/components/core/src/clp/FileCompressor.hpp +++ b/components/core/src/clp/FileCompressor.hpp @@ -133,6 +133,4 @@ namespace clp { }; } -#include "FileCompressor.tpp" - #endif // CLP_FILECOMPRESSOR_HPP diff --git a/components/core/src/clp/FileCompressor.tpp 
b/components/core/src/clp/FileCompressor.tpp deleted file mode 100644 index c189b9e20..000000000 --- a/components/core/src/clp/FileCompressor.tpp +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef CLP_FILECOMPRESSOR_TPP -#define CLP_FILECOMPRESSOR_TPP - -#include "FileCompressor.hpp" -#include "utils.hpp" - -namespace clp { -template -auto FileCompressor::compress_ir_stream_by_encoding( - size_t target_data_size_of_dicts, - streaming_archive::writer::Archive::UserConfig& archive_user_config, - size_t target_encoded_file_size, - std::string const& path, - group_id_t group_id, - streaming_archive::writer::Archive& archive, - ir::LogEventDeserializer& log_event_deserializer -) -> std::error_code { - archive.create_and_open_file(path, group_id, m_uuid_generator(), 0); - - // We assume an IR stream only has one timestamp pattern - auto timestamp_pattern = log_event_deserializer.get_timestamp_pattern(); - archive.change_ts_pattern(&timestamp_pattern); - - std::error_code error_code{}; - while (true) { - auto result = log_event_deserializer.deserialize_log_event(); - if (result.has_error()) { - auto error = result.error(); - if (std::errc::no_message_available != error) { - error_code = error; - } - break; - } - - // Split archive/encoded file if necessary before writing the new event - if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { - split_file_and_archive( - archive_user_config, - path, - group_id, - &timestamp_pattern, - archive - ); - } else if (archive.get_file().get_encoded_size_in_bytes() >= target_encoded_file_size) { - split_file(path, group_id, &timestamp_pattern, archive); - } - - archive.write_log_event_ir(result.value()); - } - - close_file_and_append_to_segment(archive); - return error_code; -} -} // namespace clp - -#endif From 9e3e6b48a9e7cbbcd091650d59400705843a1162 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Tue, 29 Aug 2023 00:10:02 -0400 Subject: [PATCH 120/121] Archive: Move template 
implementation into cpp with explicit specialization. --- components/core/CMakeLists.txt | 2 - components/core/src/clp/FileCompressor.cpp | 1 + .../src/streaming_archive/writer/Archive.cpp | 37 ++++++++++++++++++ .../src/streaming_archive/writer/Archive.hpp | 19 +++++---- .../src/streaming_archive/writer/Archive.tpp | 39 ------------------- 5 files changed, 47 insertions(+), 51 deletions(-) delete mode 100644 components/core/src/streaming_archive/writer/Archive.tpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 8aeb49913..7828f819f 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -307,7 +307,6 @@ set(SOURCE_FILES_clp src/streaming_archive/reader/SegmentManager.hpp src/streaming_archive/writer/Archive.cpp src/streaming_archive/writer/Archive.hpp - src/streaming_archive/writer/Archive.tpp src/streaming_archive/writer/File.cpp src/streaming_archive/writer/File.hpp src/streaming_archive/writer/Segment.cpp @@ -846,7 +845,6 @@ set(SOURCE_FILES_unitTest src/streaming_archive/reader/SegmentManager.hpp src/streaming_archive/writer/Archive.cpp src/streaming_archive/writer/Archive.hpp - src/streaming_archive/writer/Archive.tpp src/streaming_archive/writer/File.cpp src/streaming_archive/writer/File.hpp src/streaming_archive/writer/Segment.cpp diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index e4e414b52..88db676ee 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -6,6 +6,7 @@ #include // Boost libraries +#include #include // libarchive diff --git a/components/core/src/streaming_archive/writer/Archive.cpp b/components/core/src/streaming_archive/writer/Archive.cpp index 4d5f4bd9f..415d599e4 100644 --- a/components/core/src/streaming_archive/writer/Archive.cpp +++ b/components/core/src/streaming_archive/writer/Archive.cpp @@ -367,6 +367,34 @@ namespace streaming_archive::writer { } } + template + 
void Archive::write_log_event_ir(ir::LogEvent const& log_event) { + vector encoded_vars; + vector var_ids; + size_t original_num_bytes{0}; + EncodedVariableInterpreter::encode_and_add_to_dictionary( + log_event, + m_logtype_dict_entry, + m_var_dict, + encoded_vars, + var_ids, + original_num_bytes + ); + + logtype_dictionary_id_t logtype_id{cLogtypeDictionaryIdMax}; + m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id); + + m_file->write_encoded_msg( + log_event.get_timestamp(), + logtype_id, + encoded_vars, + var_ids, + original_num_bytes + ); + + update_segment_indices(logtype_id, var_ids); + } + void Archive::write_dir_snapshot () { // Flush dictionaries m_logtype_dict.write_header_and_flush_to_disk(); @@ -521,4 +549,13 @@ namespace streaming_archive::writer { std::cout << json_msg.dump(-1, ' ', true, nlohmann::json::error_handler_t::ignore) << std::endl; } } + + // Explicitly declare template specializations so that we can define the + // template methods in this file + template void Archive::write_log_event_ir( + ir::LogEvent const& log_event + ); + template void Archive::write_log_event_ir( + ir::LogEvent const& log_event + ); } diff --git a/components/core/src/streaming_archive/writer/Archive.hpp b/components/core/src/streaming_archive/writer/Archive.hpp index 4137b306d..64569a9f6 100644 --- a/components/core/src/streaming_archive/writer/Archive.hpp +++ b/components/core/src/streaming_archive/writer/Archive.hpp @@ -130,14 +130,6 @@ namespace streaming_archive { namespace writer { * @throw FileWriter::OperationFailed if any write fails */ void write_msg (epochtime_t timestamp, const std::string& message, size_t num_uncompressed_bytes); - /** - * Writes an IR log event to the current encoded file - * @tparam encoded_variable_t The type of the encoded variables in the - * log event - * @param log_event - */ - template - void write_log_event_ir(ir::LogEvent const& log_event); /** * Encodes and writes a message to the given file using schema file * @param 
file @@ -149,6 +141,15 @@ namespace streaming_archive { namespace writer { */ void write_msg_using_schema (compressor_frontend::Token*& uncompressed_msg, uint32_t uncompressed_msg_pos, bool has_delimiter, bool has_timestamp); + /** + * Writes an IR log event to the current encoded file + * @tparam encoded_variable_t The type of the encoded variables in the + * log event + * @param log_event + */ + template + void write_log_event_ir(ir::LogEvent const& log_event); + /** * Writes snapshot of archive to disk including metadata of all files and new dictionary entries * @throw FileWriter::OperationFailed if failed to write or flush dictionaries @@ -312,6 +313,4 @@ namespace streaming_archive { namespace writer { }; } } -#include "Archive.tpp" - #endif // STREAMING_ARCHIVE_WRITER_ARCHIVE_HPP diff --git a/components/core/src/streaming_archive/writer/Archive.tpp b/components/core/src/streaming_archive/writer/Archive.tpp deleted file mode 100644 index afbaa0f96..000000000 --- a/components/core/src/streaming_archive/writer/Archive.tpp +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef STREAMING_ARCHIVE_WRITER_ARCHIVE_TPP -#define STREAMING_ARCHIVE_WRITER_ARCHIVE_TPP - -#include - -#include "../../EncodedVariableInterpreter.hpp" -#include "../../ir/LogEvent.hpp" - -namespace streaming_archive::writer { -template -void Archive::write_log_event_ir(ir::LogEvent const& log_event) { - std::vector encoded_vars; - std::vector var_ids; - size_t original_num_bytes{0}; - EncodedVariableInterpreter::encode_and_add_to_dictionary( - log_event, - m_logtype_dict_entry, - m_var_dict, - encoded_vars, - var_ids, - original_num_bytes - ); - - logtype_dictionary_id_t logtype_id{cLogtypeDictionaryIdMax}; - m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id); - - m_file->write_encoded_msg( - log_event.get_timestamp(), - logtype_id, - encoded_vars, - var_ids, - original_num_bytes - ); - - update_segment_indices(logtype_id, var_ids); -} -} // namespace streaming_archive::writer - -#endif // 
STREAMING_ARCHIVE_WRITER_ARCHIVE_TPP From 8cbefa402e7f1cb7143b4ca2d9aa1338e179a619 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Tue, 29 Aug 2023 00:24:18 -0400 Subject: [PATCH 121/121] Remove unnecessary cAlwaysFalse const. --- components/core/src/type_utils.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/components/core/src/type_utils.hpp b/components/core/src/type_utils.hpp index 8c68d355c..b05f58524 100644 --- a/components/core/src/type_utils.hpp +++ b/components/core/src/type_utils.hpp @@ -5,13 +5,6 @@ #include #include -/** - * A template-parameterized false-constant which can be used to make static - * assertions conditional based on deduced template parameters - */ -template -[[maybe_unused]] constexpr bool cAlwaysFalse{false}; - /** * An empty type which can be used to declare variables conditionally based on * template parameters