Introduce log writer thread #1146

Draft: wants to merge 21 commits into main
Changes from 2 commits
3 changes: 2 additions & 1 deletion production/db/core/CMakeLists.txt
@@ -132,7 +132,8 @@ endif()
add_library(gaia_db_persistence STATIC
src/log_file.cpp
src/async_disk_writer.cpp
src/async_write_batch.cpp)
src/async_write_batch.cpp
src/log_io.cpp)
configure_gaia_target(gaia_db_persistence)
target_include_directories(gaia_db_persistence PRIVATE
"${GAIA_DB_CORE_PUBLIC_INCLUDES}"
20 changes: 10 additions & 10 deletions production/db/core/inc/async_disk_writer.hpp
@@ -37,7 +37,7 @@ namespace persistence
class async_disk_writer_t
{
public:
async_disk_writer_t(int validate_flushed_batch_efd, int signal_checkpoint_efd);
async_disk_writer_t(int validate_flushed_batch_eventfd, int signal_checkpoint_eventfd);

~async_disk_writer_t();

@@ -98,43 +98,43 @@ class async_disk_writer_t
/**
* Copy any temporary writes (which don't exist in gaia shared memory) into the metadata buffer.
*/
unsigned char* copy_into_metadata_buffer(void* source, size_t size, int file_fd);
unsigned char* copy_into_metadata_buffer(const void* source, size_t size, int file_fd);

/**
* Perform maintenance actions on in_flight batch after all of its IO entries have been processed.
*/
void perform_post_completion_maintenance();

void add_decisions_to_batch(decision_list_t& decisions);
void add_decisions_to_batch(const decision_list_t& decisions);

/**
* For each commit ts, keep track of the eventfd which the session thread blocks on. Once the txn
* has been made durable, this eventfd is written to so that the session thread can make progress and
* return commit decision to the client.
*/
void map_commit_ts_to_session_decision_efd(gaia_txn_id_t commit_ts, int session_decision_efd);
void map_commit_ts_to_session_decision_eventfd(gaia_txn_id_t commit_ts, int session_decision_eventfd);

private:
// Reserve slots in the in_progress batch to be able to append additional operations to it (before it gets submitted to the kernel)
static constexpr size_t c_submit_batch_sqe_count = 3;
static constexpr size_t c_single_submission_entry_count = 1;
static constexpr size_t c_async_batch_size = 32;
static constexpr size_t c_max_iovec_array_size_bytes = IOV_MAX * sizeof(iovec);
static inline eventfd_t c_default_flush_efd_value = 1;
static inline iovec c_default_iov = {static_cast<void*>(&c_default_flush_efd_value), sizeof(eventfd_t)};
static inline eventfd_t c_default_flush_eventfd_value = 1;
static inline iovec c_default_iov = {static_cast<void*>(&c_default_flush_eventfd_value), sizeof(eventfd_t)};

// eventfd to signal that a batch flush has completed.
// Used to block new writes to disk when a batch is already getting flushed.
static inline int s_flush_efd = -1;
static inline int s_flush_eventfd = -1;

// eventfd to signal that the IO results belonging to a batch are ready to be validated.
int m_validate_flush_efd = -1;
int m_validate_flush_eventfd = -1;

// eventfd to signal that a file is ready to be checkpointed.
int m_signal_checkpoint_efd = -1;
int m_signal_checkpoint_eventfd = -1;

// Keep track of session threads to unblock.
std::unordered_map<gaia_txn_id_t, int> m_ts_to_session_decision_fd_map;
std::unordered_map<gaia_txn_id_t, int> m_ts_to_session_decision_eventfd_map;

// Writes are batched and we maintain two buffers so that writes to a buffer
// can still proceed when the other buffer is getting flushed to disk.
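
The eventfd handshake described in the map_commit_ts_to_session_decision_eventfd() comment boils down to the following pattern. This is a minimal standalone sketch with hypothetical function names, not code from the PR:

#include <sys/eventfd.h>

// Session thread: block until the commit decision for this txn has been persisted.
inline void wait_for_durable_decision(int session_decision_eventfd)
{
    eventfd_t value = 0;
    ::eventfd_read(session_decision_eventfd, &value);
}

// Log writer thread: signal the session thread once the decision record is on disk,
// unblocking txn_commit() so it can return the decision to the client.
inline void signal_decision_durable(int session_decision_eventfd)
{
    ::eventfd_write(session_decision_eventfd, 1);
}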
8 changes: 8 additions & 0 deletions production/db/core/inc/db_helpers.hpp
@@ -132,6 +132,11 @@ inline void allocate_object(
gaia_offset_t object_offset = chunk_manager->allocate(size + c_db_object_header_size);
if (object_offset == c_invalid_gaia_offset)
{
if (gaia::db::get_mapped_log()->data()->chunk_count == c_max_chunks_per_txn)
Contributor:

It's not clear to me that there's any reason for the chunk_count to be persisted in the txn log itself, rather than in client-side session thread TLS or shared session state (which we need anyway for crash recovery). Do we need the chunk count in the txn log as anything but a consistency check when we extract the set of used chunks from redo offsets during a scan on the server?

In general I'd prefer to avoid storing redundant information like this in persistent structures, unless there's a compelling reason to do so for performance or simplicity.

Author:

You're correct. I can infer the set of chunks from the txn log and move this to client TLS.

{
throw memory_allocation_error_internal("Maximum number of chunks for this transaction has been reached.");
Contributor:

I don't think this is an appropriate exception type since 1) memory_allocation_error is a user-facing exception type, but "chunk" is an internal implementation detail, and 2) we haven't really run out of memory at all (i.e. other allocations, including concurrent allocations, could still succeed). I think instead we should introduce a new user-facing exception type transaction_memory_limit_exceeded, analogous to transaction_object_limit_exceeded:

/**
 * \brief The transaction exceeded its memory limit.
 *
 * A transaction can use at most 2^32 GB of memory.
 */
class transaction_memory_limit_exceeded : public common::gaia_exception
{
};

class transaction_memory_limit_exceeded_internal : public transaction_memory_limit_exceeded
{
public:
    transaction_memory_limit_exceeded_internal();
};

transaction_memory_limit_exceeded_internal::transaction_memory_limit_exceeded_internal()
{
    m_message = "Transaction exceeded its memory limit.";
}

Author:

done

}

if (chunk_manager->initialized())
{
// The current chunk is out of memory, so retire it and allocate a new chunk.
@@ -159,6 +164,9 @@ inline void allocate_object(
// on the server in case we crash.
gaia::db::get_mapped_log()->data()->current_chunk = new_chunk_offset;

auto& chunk = gaia::db::get_mapped_log()->data()->chunks[gaia::db::get_mapped_log()->data()->chunk_count++];
Contributor:

I don't understand this initialization that is immediately overwritten. You aren't using the initial value of chunk anywhere, so what is the point of the initialization expression (besides incrementing chunk_count as a side effect)?

Contributor:

Oh I think I see, you're actually initializing a reference to the new chunk offset's location and assigning to the dereferenced location. I think that's quite unclear and you should just take the address and assign to a dereferenced pointer.
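
For illustration, the two forms being contrasted are roughly the following. This is a self-contained sketch with a hypothetical stand-in struct, not a proposed patch:

#include <cstddef>

// Hypothetical stand-in for the relevant txn_log_t fields.
struct txn_log_sketch_t
{
    size_t chunk_count;
    size_t chunks[8];
};

void record_chunk(txn_log_sketch_t& log, size_t new_chunk_offset)
{
    // Form used in the diff: bind a reference to the next slot; chunk_count++ is only a side effect.
    auto& chunk = log.chunks[log.chunk_count++];
    chunk = new_chunk_offset;
}

void record_chunk_with_pointer(txn_log_sketch_t& log, size_t new_chunk_offset)
{
    // Suggested form: take the address of the slot and assign through the dereferenced pointer.
    size_t* chunk_slot = &log.chunks[log.chunk_count++];
    *chunk_slot = new_chunk_offset;
}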

Contributor:

(Besides which, I'm not sure we need to store the set of chunks at all in the log, but if we do, why are you reconstructing it again from the offsets when you scan the log?)

Author:

The set of chunks is merely stored to retain chunk order. I have deleted this logic now.

chunk = static_cast<size_t>(new_chunk_offset);

// Allocate from new chunk.
object_offset = chunk_manager->allocate(size + c_db_object_header_size);
}
12 changes: 10 additions & 2 deletions production/db/core/inc/db_internal_types.hpp
@@ -83,8 +83,13 @@ constexpr size_t c_max_locators = (1ULL << 32) - 1;
// similarly optimized by substituting locators for gaia_ids.
constexpr size_t c_hash_buckets = 1ULL << 20;

// This is arbitrary, but we need to keep txn logs to a reasonable size.
constexpr size_t c_max_log_records = 1ULL << 20;
// Track maximum number of new chunks (apart from the one that the txn is already using)
// that can be allocated per transaction.
// This sets an upper bound on txn size: 32MB < txn_size < 36MB
constexpr size_t c_max_chunks_per_txn = 8;

// 8 chunks can hold up to 8 * (2^16 - 2^8) = 522240 64B objects.
constexpr size_t c_max_log_records = 522240;

// This is an array of offsets in the data segment corresponding to object
// versions, where each array index is referred to as a "locator."
@@ -108,6 +113,8 @@ struct txn_log_t
// convenient place for shared state between the client and server.
memory_manager::chunk_offset_t current_chunk;
size_t record_count;
int session_decision_eventfd;
size_t chunk_count;

struct log_record_t
{
@@ -135,6 +142,7 @@ struct txn_log_t
};

log_record_t log_records[c_max_log_records];
gaia_offset_t chunks[c_max_chunks_per_txn];

friend std::ostream& operator<<(std::ostream& os, const txn_log_t& l)
{
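
As a sanity check on the bounds stated in this hunk, the arithmetic works out as follows, assuming 4MB chunks and 64B slots (implied by the 32MB-for-8-chunks figure) and treating the 2^8 subtracted slots as per-chunk metadata; the constants below are illustrative only, not the memory manager's definitions:

#include <cstddef>

constexpr size_t c_chunk_size_bytes = 4ULL * 1024 * 1024;                    // 2^22
constexpr size_t c_slot_size_bytes = 64;                                     // 2^6
constexpr size_t c_slots_per_chunk = c_chunk_size_bytes / c_slot_size_bytes; // 2^16
constexpr size_t c_reserved_slots_per_chunk = 1ULL << 8;                     // assumed metadata slots
constexpr size_t c_max_new_chunks_per_txn = 8;

// 8 * (2^16 - 2^8) = 522240 log records, i.e. the c_max_log_records value above.
static_assert(c_max_new_chunks_per_txn * (c_slots_per_chunk - c_reserved_slots_per_chunk) == 522240);

// 8 new chunks = 32MB; adding the chunk the txn is already using gives the ~36MB upper bound.
static_assert(c_max_new_chunks_per_txn * c_chunk_size_bytes == 32ULL * 1024 * 1024);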
36 changes: 36 additions & 0 deletions production/db/core/inc/db_server.hpp
@@ -21,6 +21,7 @@
#include "gaia_internal/common/generator_iterator.hpp"

#include "db_internal_types.hpp"
#include "log_io.hpp"
#include "mapped_data.hpp"
#include "memory_manager.hpp"
#include "messages_generated.h"
@@ -114,6 +115,12 @@ class server_t
private:
static inline server_config_t s_server_conf{};

// TODO: Delete this once recovery/checkpointing implementation is in.
Contributor:

Instead of adding all these pieces to db_server, can you instead create a separate abstraction for handling log writing and just have the server keep track of an instance of it? That should separate the main server code from the log writing component.

static inline bool c_use_gaia_log_implementation = false;

// TODO: Make configurable.
static constexpr int64_t c_txn_group_timeout_us = 100;
Contributor:

Could you add some comment about how you arrived at this fixed value? (E.g., SSD write latency, txn commit latency SLO, etc.)


// This is arbitrary but seems like a reasonable starting point (pending benchmarks).
static constexpr size_t c_stream_batch_size{1ULL << 10};

@@ -128,6 +135,15 @@ class server_t
static inline int s_server_shutdown_eventfd = -1;
static inline int s_listening_socket = -1;

// Signals the log writer thread to persist txn updates.
static inline int s_signal_log_write_eventfd = -1;

// Signals the log writer thread to persist txn decisions.
static inline int s_signal_decision_eventfd = -1;

// Signals the checkpointing thread to merge log file updates into the LSM store.
static inline int s_signal_checkpoint_log_eventfd = -1;

// These thread objects are owned by the client dispatch thread.
static inline std::vector<std::thread> s_session_threads{};

@@ -137,6 +153,7 @@ class server_t
static inline mapped_data_t<id_index_t> s_shared_id_index{};
static inline index::indexes_t s_global_indexes{};
static inline std::unique_ptr<persistent_store_manager> s_persistent_store{};
static inline std::unique_ptr<persistence::log_handler_t> s_log_handler{};

// These fields have transaction lifetime.
thread_local static inline gaia_txn_id_t s_txn_id = c_invalid_gaia_txn_id;
@@ -155,6 +172,11 @@ class server_t
thread_local static inline bool s_session_shutdown = false;
thread_local static inline int s_session_shutdown_eventfd = -1;

thread_local static inline int s_session_decision_eventfd = -1;

// Signal to persistence thread that a batch is ready to be validated.
Contributor:

This declaration should be moved out of the group labeled "These fields have session lifetime." since it's not thread-local and not scoped to a session.

Author:

Fixed.

static inline int s_validate_persistence_batch_eventfd = -1;

// These thread objects are owned by the session thread that created them.
thread_local static inline std::vector<std::thread> s_session_owned_threads{};

@@ -242,6 +264,12 @@ class server_t
// The current thread's index in `s_safe_ts_per_thread_entries`.
thread_local static inline size_t s_safe_ts_index{c_invalid_safe_ts_index};

// Keep track of the last txn that has been submitted to the async_disk_writer.
static inline std::atomic<gaia_txn_id_t> s_last_queued_commit_ts_upper_bound = c_invalid_gaia_txn_id;

// Keep track of undecided txns submitted to the async_disk_writer.
static inline std::set<gaia_txn_id_t> s_seen_and_undecided_txn_set{};
Contributor:

Does this need to be ordered? If not I'd prefer std::unordered_set. Also, I generally want to avoid allocation-heavy structures in the server, even though this isn't in the txn commit critical path. Why don't we discuss the requirements for this data structure further? (I think it may be worth adding a lock-free set that uses just std::array with linear search to our common library.)

Author:

This structure tracks txn commit timestamps and enables txn decisions to be logged in commit order. For example, if txn 1 -> txn 2 -> txn 3, then these three decisions will be logged in that order and recovery will proceed in the same order.
Note that the txn data itself can be logged out of commit order.
Regarding the lock-free array, how will memory reclamation work? Is it similar to how it happens in the txn array? Will this also be a shared-memory array? I like the idea, but I will keep it out of the current PR since I need to understand more details.
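
A rough sketch of the ordering property described in this reply, using hypothetical names rather than the PR's actual bookkeeping:

#include <cstdint>
#include <set>
#include <vector>

// Commit timestamps of txns whose data has been queued to the async_disk_writer
// but whose commit/abort decision has not yet been written to the log.
std::set<uint64_t> seen_and_undecided_txns;

// Decisions are emitted in ascending commit_ts order: a decision is appended only
// once every earlier queued txn has also been decided, even though the txn data
// itself may have been written out of commit order.
std::vector<uint64_t> drain_decisions_in_commit_order(const std::set<uint64_t>& decided_txns)
{
    std::vector<uint64_t> ready;
    for (auto it = seen_and_undecided_txns.begin(); it != seen_and_undecided_txns.end();)
    {
        if (decided_txns.count(*it) == 0)
        {
            // The earliest queued txn is still undecided; later decisions must wait.
            break;
        }
        ready.push_back(*it);
        it = seen_and_undecided_txns.erase(it);
    }
    return ready;
}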


private:
// Returns the current value of the given watermark.
static inline gaia_txn_id_t get_watermark(watermark_type_t watermark_type)
@@ -409,6 +437,14 @@ class server_t

static void client_dispatch_handler(const std::string& socket_name);

static void log_writer_handler();

static void write_to_persistent_log(int64_t txn_group_timeout_us, bool sync_writes = false);
Contributor:

Why is txn_group_timeout_us a signed type?

Contributor:

Also, is it necessary to make this configurable rather than just using the constant c_txn_group_timeout_us?

Author:

Updated everywhere to constant.

Contributor:

Please use the same parameter name as async_disk_writer_t::submit_and_swap_in_progress_batch(): should_wait_for_completion.

Author:

Updated

Contributor:

Not sure about the name write_to_persistent_log() either. Write what to a persistent log? The name suggests an object of the "write" action, but there is no method target or function parameter that could serve as such. Maybe something like persist_pending_writes() or write_pending_updates_to_persistent_log() would be clearer?

Author:

Updated to persist_pending_writes()


static void recover_persistent_log();

static void flush_all_pending_writes();
Contributor:

This could be just flush_pending_writes().

Author:

Updated


static void session_handler(int session_socket);

static std::pair<int, int> get_stream_socket_pair();
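
The eventfds and group-commit timeout declared above suggest a wait loop along the following lines. This is a hypothetical sketch, not the PR's log_writer_handler() implementation:

#include <poll.h>
#include <sys/eventfd.h>

// Multiplex the log writer's signaling eventfds with the group-commit timeout
// so that partial groups are still flushed promptly.
void log_writer_wait_sketch(int signal_log_write_eventfd, int signal_decision_eventfd)
{
    pollfd fds[] = {
        {signal_log_write_eventfd, POLLIN, 0},
        {signal_decision_eventfd, POLLIN, 0},
    };

    // poll() works in milliseconds, so c_txn_group_timeout_us (100us) is rounded up to 1ms here.
    int ready = ::poll(fds, 2, 1);

    if (ready > 0 && (fds[0].revents & POLLIN))
    {
        eventfd_t value = 0;
        ::eventfd_read(signal_log_write_eventfd, &value); // Drain the counter.
        // ...queue the newly committed txns' log writes...
    }
    if (ready > 0 && (fds[1].revents & POLLIN))
    {
        eventfd_t value = 0;
        ::eventfd_read(signal_decision_eventfd, &value);
        // ...append a decision record for the decided txns...
    }
    // On timeout (ready == 0), submit whatever has been batched so far.
}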
11 changes: 8 additions & 3 deletions production/db/core/inc/log_file.hpp
@@ -34,12 +34,12 @@ class log_file_t
/**
* Obtain offset to write the next log record at.
*/
size_t get_current_offset();
file_offset_t get_current_offset();

/**
* Get remaining space in persistent log file.
*/
size_t get_remaining_bytes_count(size_t record_size);
size_t get_bytes_remaining_after_append(size_t record_size);

/**
* Allocate space in persistent log file.
@@ -48,10 +48,15 @@

int get_file_fd();

/**
* Obtain sequence number for the file.
*/
file_sequence_t get_file_sequence();

private:
size_t m_file_size;
file_sequence_t m_file_seq;
size_t m_current_offset;
file_offset_t m_current_offset;
std::string m_dir_name;
int m_dir_fd;
int m_file_fd;
109 changes: 109 additions & 0 deletions production/db/core/inc/log_io.hpp
@@ -0,0 +1,109 @@
/////////////////////////////////////////////
// Copyright (c) Gaia Platform LLC
// All rights reserved.
/////////////////////////////////////////////

#pragma once

#include <fcntl.h>
#include <unistd.h>

#include <cstddef>

#include <atomic>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>

#include "gaia/common.hpp"

#include "gaia_internal/db/db_object.hpp"

#include "async_disk_writer.hpp"
#include "db_internal_types.hpp"
#include "log_file.hpp"
#include "memory_manager.hpp"

namespace gaia
{
namespace db
{
namespace persistence
{

/*
* Fill the record_header.crc field with CRC_INITIAL_VALUE when
* computing the checksum: crc32c is vulnerable to 0-prefixing,
* so we make sure the initial bytes are non-zero.
*/
static constexpr crc32_t c_crc_initial_value = ((uint32_t)-1);

class log_handler_t
{
public:
explicit log_handler_t(const std::string& directory_path);
~log_handler_t();
void open_for_writes(int validate_flushed_batch_eventfd, int signal_checkpoint_eventfd);

/**
* Allocate space in the log file and return starting offset of allocation.
*/
file_offset_t allocate_log_space(size_t payload_size);

/**
* Create a log record which stores txn information.
*/
void create_txn_record(
gaia_txn_id_t commit_ts,
record_type_t type,
std::vector<gaia_offset_t>& object_offsets,
std::vector<gaia::common::gaia_id_t>& deleted_ids);

/**
* Process the in memory txn_log and submit the processed writes (generated log records) to the async_disk_writer.
*/
void process_txn_log_and_write(int txn_log_fd, gaia_txn_id_t commit_ts);

/**
* Create a log record which stores decisions for one or more txns.
*/
void create_decision_record(const decision_list_t& txn_decisions);

/**
* Submit async_disk_writer's internal I/O request queue to the kernel for processing.
*/
void submit_writes(bool sync);

/**
* Validate the result of I/O calls submitted to the kernel for processing.
*/
void validate_flushed_batch();

/**
* Track the session_decision_eventfd for each commit_ts; txn_commit() will only return once
* session_decision_eventfd is written to by the log_writer thread - signifying that the txn decision
* has been persisted.
*/
void register_commit_ts_for_session_notification(gaia_txn_id_t commit_ts, int session_decision_eventfd);

private:
// TODO: Make log file size configurable.
static constexpr uint64_t c_file_size = 4 * 1024 * 1024;
static constexpr std::string_view c_gaia_wal_dir_name = "/wal_dir";
static constexpr int c_gaia_wal_dir_permissions = 0755;
static inline std::string s_wal_dir_path{};
static inline int s_dir_fd = -1;

// Log file sequence starts from 1.
static inline std::atomic<file_sequence_t::value_type> s_file_num = 1;

// Keep track of the current log file.
std::unique_ptr<log_file_t> m_current_file;

std::unique_ptr<async_disk_writer_t> m_async_disk_writer;
};

} // namespace persistence
} // namespace db
} // namespace gaia
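
A minimal usage sketch based only on the declarations in this header; the actual call sequence in the server's log writer thread may differ:

#include <string>

#include "log_io.hpp"

namespace gaia
{
namespace db
{
namespace persistence
{

// Sketch only: drive log_handler_t through a single write/decision cycle.
inline void log_writer_sketch(
    const std::string& data_dir,
    int validate_flushed_batch_eventfd,
    int signal_checkpoint_eventfd,
    int txn_log_fd,
    gaia_txn_id_t commit_ts,
    int session_decision_eventfd,
    const decision_list_t& decisions)
{
    log_handler_t handler(data_dir);
    handler.open_for_writes(validate_flushed_batch_eventfd, signal_checkpoint_eventfd);

    // Convert the shared-memory txn log into log records and queue them for writing.
    handler.process_txn_log_and_write(txn_log_fd, commit_ts);

    // txn_commit() on the session thread blocks until this eventfd is signaled.
    handler.register_commit_ts_for_session_notification(commit_ts, session_decision_eventfd);

    // Append a decision record covering one or more decided txns.
    handler.create_decision_record(decisions);

    // Hand the queued writes to the kernel without waiting for completion.
    handler.submit_writes(false);

    // Later, once the flush eventfd fires, check the I/O results of the batch.
    handler.validate_flushed_batch();
}

} // namespace persistence
} // namespace db
} // namespace gaia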