/*
 *
 * Copyright 2015 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Implements an efficient in-memory log, optimized for multiple writers and
// a single reader. Available log space is divided up into blocks of
// CENSUS_LOG_2_MAX_RECORD_SIZE bytes. A block can be in one of the following
// three data structures:
// - Free blocks (free_block_list)
// - Blocks with unread data (dirty_block_list)
// - Blocks currently attached to cores (core_local_blocks[])
//
// census_log_start_write() moves a block from core_local_blocks[] to the end
// of dirty_block_list when the block:
// - is out-of-space OR
// - has an incomplete record (an incomplete record occurs when a thread calls
//   census_log_start_write() and is context-switched before calling
//   census_log_end_write()).
// So, blocks in dirty_block_list are ordered, from oldest to newest, by the
// time when the block is detached from the core.
//
// census_log_read_next() first iterates over dirty_block_list and then
// core_local_blocks[]. It moves completely read blocks from dirty_block_list
// to free_block_list. Blocks in core_local_blocks[] are not freed, even when
// completely read.
//
// If the log is configured to discard old records and free_block_list is
// empty, census_log_start_write() iterates over dirty_block_list to allocate
// a new block. It moves the oldest available block (no pending read/write) to
// core_local_blocks[].
//
// core_local_block_struct is used to implement a map from core id to the
// block associated with that core. This mapping is advisory. It is possible
// that the block returned by this mapping is no longer associated with that
// core. This mapping is updated, lazily, by census_log_start_write().
//
// Locking in block struct:
//
// Exclusive g_log.lock must be held before calling any functions operating on
// block structs except census_log_start_write() and census_log_end_write().
//
// Writes to a block are serialized via writer_lock. census_log_start_write()
// acquires this lock and census_log_end_write() releases it. On failure to
// acquire the lock, the writer allocates a new block for the current core and
// updates core_local_block accordingly.
//
// Simultaneous read and write access is allowed. Readers can safely read up
// to committed bytes (bytes_committed).
//
// reader_lock protects the block currently being read from getting recycled.
// start_read() acquires reader_lock and end_read() releases the lock.
//
// Read/write access to a block is disabled via try_disable_access(). It
// returns with both writer_lock and reader_lock held. These locks are
// subsequently released by enable_access() to enable access to the block.
//
// A note on naming: Most function/struct names are prepended by cl_
// (shorthand for census_log). Further, functions that manipulate structures
// include the name of the structure, which will be passed as the first
// argument. E.g. cl_block_initialize() will initialize a cl_block.
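//
// Illustrative writer-side sketch (not part of this file; 'payload' is a
// hypothetical record type and error handling is elided). A writer reserves
// space, copies its record, then commits it; the bytes become visible to the
// reader only after census_log_end_write():
//
//   census_log_initialize(1 /* size_in_mb */, 1 /* discard_old_records */);
//   ...
//   struct payload p;
//   void* slot = census_log_start_write(sizeof(p));
//   if (slot != NULL) {
//     memcpy(slot, &p, sizeof(p));
//     census_log_end_write(slot, sizeof(p));
//   }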
#include "src/core/ext/census/mlog.h"
#include <grpc/support/alloc.h>
#include <grpc/support/atm.h>
#include <grpc/support/cpu.h>
#include <grpc/support/log.h>
#include <grpc/support/sync.h>
#include <grpc/support/useful.h>
#include <stdbool.h>
#include <string.h>
// End of platform specific code

typedef struct census_log_block_list_struct {
  struct census_log_block_list_struct* next;
  struct census_log_block_list_struct* prev;
  struct census_log_block* block;
} cl_block_list_struct;

typedef struct census_log_block {
  // Pointer to underlying buffer.
  char* buffer;
  gpr_atm writer_lock;
  gpr_atm reader_lock;
  // Keeps completely written bytes. Declared atomic because accessed
  // simultaneously by reader and writer.
  gpr_atm bytes_committed;
  // Bytes already read.
  size_t bytes_read;
  // Links for list.
  cl_block_list_struct link;
  // We want this structure to be cacheline aligned. We assume the following
  // sizes for the various parts on 32/64bit systems:
  // type                   32b size  64b size
  // char*                   4         8
  // 3x gpr_atm             12        24
  // size_t                  4         8
  // cl_block_list_struct   12        24
  // TOTAL                  32        64
  //
  // Depending on the size of our cacheline and the architecture, we
  // selectively add char buffering to this structure. The size is checked
  // via assert in census_log_initialize().
#if defined(GPR_ARCH_64)
#define CL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 64)
#else
#if defined(GPR_ARCH_32)
#define CL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 32)
#else
#error "Unknown architecture"
#endif
#endif
#if CL_BLOCK_PAD_SIZE > 0
  char padding[CL_BLOCK_PAD_SIZE];
#endif
} cl_block;

// A list of cl_blocks, doubly-linked through cl_block::link.
typedef struct census_log_block_list {
  int32_t count;            // Number of items in list.
  cl_block_list_struct ht;  // Head/tail of linked list.
} cl_block_list;

// Cacheline aligned block pointers to avoid false sharing. Block pointer must
// be initialized via set_block() before calling other functions.
typedef struct census_log_core_local_block {
  gpr_atm block;
// Ensure cacheline alignment: we assume sizeof(gpr_atm) == 4 or 8.
#if defined(GPR_ARCH_64)
#define CL_CORE_LOCAL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 8)
#else
#if defined(GPR_ARCH_32)
#define CL_CORE_LOCAL_BLOCK_PAD_SIZE (GPR_CACHELINE_SIZE - 4)
#else
#error "Unknown architecture"
#endif
#endif
#if CL_CORE_LOCAL_BLOCK_PAD_SIZE > 0
  char padding[CL_CORE_LOCAL_BLOCK_PAD_SIZE];
#endif
} cl_core_local_block;

struct census_log {
  int discard_old_records;
  // Number of cores (aka hardware-contexts).
  unsigned num_cores;
  // Number of CENSUS_LOG_2_MAX_RECORD_SIZE blocks in log.
  uint32_t num_blocks;
  cl_block* blocks;                        // Block metadata.
  cl_core_local_block* core_local_blocks;  // Keeps core to block mappings.
  gpr_mu lock;
  int initialized;  // Has the log been initialized?
  // Keeps the state of the reader iterator. A value of 0 indicates that the
  // iterator has reached the end. census_log_init_reader() resets the value
  // to num_cores to restart iteration.
  uint32_t read_iterator_state;
  // Points to the block being read. If non-NULL, the block is locked for
  // reading (block_being_read->reader_lock is held).
  cl_block* block_being_read;
  char* buffer;
  cl_block_list free_block_list;
  cl_block_list dirty_block_list;
  gpr_atm out_of_space_count;
};

// Single internal log.
static struct census_log g_log;

// Functions that operate on an atomic memory location used as a lock.

// Returns non-zero if lock is acquired.
static int cl_try_lock(gpr_atm* lock) { return gpr_atm_acq_cas(lock, 0, 1); }

static void cl_unlock(gpr_atm* lock) { gpr_atm_rel_store(lock, 0); }

// Functions that operate on cl_core_local_block's.
static void cl_core_local_block_set_block(cl_core_local_block* clb,
                                          cl_block* block) {
  gpr_atm_rel_store(&clb->block, (gpr_atm)block);
}

static cl_block* cl_core_local_block_get_block(cl_core_local_block* clb) {
  return (cl_block*)gpr_atm_acq_load(&clb->block);
}

// Functions that operate on cl_block_list_struct's.

static void cl_block_list_struct_initialize(cl_block_list_struct* bls,
                                            cl_block* block) {
  bls->next = bls->prev = bls;
  bls->block = block;
}

// Functions that operate on cl_block_list's.

static void cl_block_list_initialize(cl_block_list* list) {
  list->count = 0;
  cl_block_list_struct_initialize(&list->ht, NULL);
}

// Returns head of *this, or NULL if empty.
static cl_block* cl_block_list_head(cl_block_list* list) {
  return list->ht.next->block;
}

// Insert element *e after *pos.
static void cl_block_list_insert(cl_block_list* list, cl_block_list_struct* pos,
                                 cl_block_list_struct* e) {
  list->count++;
  e->next = pos->next;
  e->prev = pos;
  e->next->prev = e;
  e->prev->next = e;
}

// Insert block at the head of the list.
static void cl_block_list_insert_at_head(cl_block_list* list,
                                         cl_block* block) {
  cl_block_list_insert(list, &list->ht, &block->link);
}

// Insert block at the tail of the list.
static void cl_block_list_insert_at_tail(cl_block_list* list,
                                         cl_block* block) {
  cl_block_list_insert(list, list->ht.prev, &block->link);
}

// Removes block *b. Requires *b be in the list.
static void cl_block_list_remove(cl_block_list* list, cl_block* b) {
  list->count--;
  b->link.next->prev = b->link.prev;
  b->link.prev->next = b->link.next;
}

// Functions that operate on cl_block's.

static void cl_block_initialize(cl_block* block, char* buffer) {
  block->buffer = buffer;
  gpr_atm_rel_store(&block->writer_lock, 0);
  gpr_atm_rel_store(&block->reader_lock, 0);
  gpr_atm_rel_store(&block->bytes_committed, 0);
  block->bytes_read = 0;
  cl_block_list_struct_initialize(&block->link, block);
}

// Guards against exposing a partially written buffer to the reader.
static void cl_block_set_bytes_committed(cl_block* block,
                                         size_t bytes_committed) {
  gpr_atm_rel_store(&block->bytes_committed, (gpr_atm)bytes_committed);
}

static size_t cl_block_get_bytes_committed(cl_block* block) {
  return (size_t)gpr_atm_acq_load(&block->bytes_committed);
}

// Tries to disable future read/write access to this block. Succeeds if:
// - no in-progress write AND
// - no in-progress read AND
// - 'discard_data' set to true OR no unread data
// On success, clears the block state and returns with writer_lock and
// reader_lock held. These locks are released by a subsequent
// cl_block_enable_access() call.
static bool cl_block_try_disable_access(cl_block* block, int discard_data) {
  if (!cl_try_lock(&block->writer_lock)) {
    return false;
  }
  if (!cl_try_lock(&block->reader_lock)) {
    cl_unlock(&block->writer_lock);
    return false;
  }
  if (!discard_data &&
      (block->bytes_read != cl_block_get_bytes_committed(block))) {
    cl_unlock(&block->reader_lock);
    cl_unlock(&block->writer_lock);
    return false;
  }
  cl_block_set_bytes_committed(block, 0);
  block->bytes_read = 0;
  return true;
}

static void cl_block_enable_access(cl_block* block) {
  cl_unlock(&block->reader_lock);
  cl_unlock(&block->writer_lock);
}

// Returns with writer_lock held.
static void* cl_block_start_write(cl_block* block, size_t size) {
  if (!cl_try_lock(&block->writer_lock)) {
    return NULL;
  }
  size_t bytes_committed = cl_block_get_bytes_committed(block);
  if (bytes_committed + size > CENSUS_LOG_MAX_RECORD_SIZE) {
    cl_unlock(&block->writer_lock);
    return NULL;
  }
  return block->buffer + bytes_committed;
}

// Releases writer_lock and increments committed bytes by 'bytes_written'.
// 'bytes_written' must be <= 'size' specified in the corresponding
// cl_block_start_write() call. This function is thread-safe.
static void cl_block_end_write(cl_block* block, size_t bytes_written) {
  cl_block_set_bytes_committed(
      block, cl_block_get_bytes_committed(block) + bytes_written);
  cl_unlock(&block->writer_lock);
}

// Returns a pointer to the first unread byte in the buffer. The number of
// bytes available is returned in 'bytes_available'. Acquires reader_lock,
// which is released by a subsequent cl_block_end_read() call. Returns NULL
// if:
// - read in progress
// - no data available
static void* cl_block_start_read(cl_block* block, size_t* bytes_available) {
  if (!cl_try_lock(&block->reader_lock)) {
    return NULL;
  }
  // bytes_committed may change from under us. Use bytes_available to update
  // bytes_read below.
  size_t bytes_committed = cl_block_get_bytes_committed(block);
  GPR_ASSERT(bytes_committed >= block->bytes_read);
  *bytes_available = bytes_committed - block->bytes_read;
  if (*bytes_available == 0) {
    cl_unlock(&block->reader_lock);
    return NULL;
  }
  void* record = block->buffer + block->bytes_read;
  block->bytes_read += *bytes_available;
  return record;
}

static void cl_block_end_read(cl_block* block) {
  cl_unlock(&block->reader_lock);
}

// Internal functions operating on g_log.

// Allocates a new free block (or recycles an available dirty block if the log
// is configured to discard old records). Returns NULL if out-of-space.
static cl_block* cl_allocate_block(void) {
  cl_block* block = cl_block_list_head(&g_log.free_block_list);
  if (block != NULL) {
    cl_block_list_remove(&g_log.free_block_list, block);
    return block;
  }
  if (!g_log.discard_old_records) {
    // No free block and the log is configured to keep old records.
    return NULL;
  }
  // Recycle a dirty block. Start from the oldest.
  for (block = cl_block_list_head(&g_log.dirty_block_list); block != NULL;
       block = block->link.next->block) {
    if (cl_block_try_disable_access(block, 1 /* discard data */)) {
      cl_block_list_remove(&g_log.dirty_block_list, block);
      return block;
    }
  }
  return NULL;
}

// Allocates a new block and updates core id => block mapping. 'old_block'
// points to the block that the caller thinks is attached to
// 'core_id'. 'old_block' may be NULL. Returns true if:
// - allocated a new block OR
// - 'core_id' => 'old_block' mapping changed (another thread allocated a
//   block before the lock was acquired).
static bool cl_allocate_core_local_block(uint32_t core_id,
                                         cl_block* old_block) {
  // Now that we have the lock, check if the core-local mapping has changed.
  cl_core_local_block* core_local_block = &g_log.core_local_blocks[core_id];
  cl_block* block = cl_core_local_block_get_block(core_local_block);
  if ((block != NULL) && (block != old_block)) {
    return true;
  }
  if (block != NULL) {
    cl_core_local_block_set_block(core_local_block, NULL);
    cl_block_list_insert_at_tail(&g_log.dirty_block_list, block);
  }
  block = cl_allocate_block();
  if (block == NULL) {
    return false;
  }
  cl_core_local_block_set_block(core_local_block, block);
  cl_block_enable_access(block);
  return true;
}

static cl_block* cl_get_block(void* record) {
  uintptr_t p = (uintptr_t)((char*)record - g_log.buffer);
  uintptr_t index = p >> CENSUS_LOG_2_MAX_RECORD_SIZE;
  return &g_log.blocks[index];
}

// Gets the next block to read and tries to free the 'prev' block (if not
// NULL). Returns NULL if the end has been reached.
static cl_block* cl_next_block_to_read(cl_block* prev) {
  cl_block* block = NULL;
  if (g_log.read_iterator_state == g_log.num_cores) {
    // We are traversing the dirty list; find the next dirty block.
    if (prev != NULL) {
      // Try to free the previous block if there is no unread data. This block
      // may have unread data if a previously incomplete record completed
      // between read_next() calls.
      block = prev->link.next->block;
      if (cl_block_try_disable_access(prev, 0 /* do not discard data */)) {
        cl_block_list_remove(&g_log.dirty_block_list, prev);
        cl_block_list_insert_at_head(&g_log.free_block_list, prev);
      }
    } else {
      block = cl_block_list_head(&g_log.dirty_block_list);
    }
    if (block != NULL) {
      return block;
    }
    // We are done with the dirty list; moving on to core-local blocks.
  }
  while (g_log.read_iterator_state > 0) {
    g_log.read_iterator_state--;
    block = cl_core_local_block_get_block(
        &g_log.core_local_blocks[g_log.read_iterator_state]);
    if (block != NULL) {
      return block;
    }
  }
  return NULL;
}

#define CL_LOG_2_MB 20  // 2^20 = 1MB

// External functions: primary stats_log interface.

void census_log_initialize(size_t size_in_mb, int discard_old_records) {
  // Check cacheline alignment.
  GPR_ASSERT(sizeof(cl_block) % GPR_CACHELINE_SIZE == 0);
  GPR_ASSERT(sizeof(cl_core_local_block) % GPR_CACHELINE_SIZE == 0);
  GPR_ASSERT(!g_log.initialized);
  g_log.discard_old_records = discard_old_records;
  g_log.num_cores = gpr_cpu_num_cores();
  // Ensure that we will not get any overflow in calculating num_blocks.
  GPR_ASSERT(CL_LOG_2_MB >= CENSUS_LOG_2_MAX_RECORD_SIZE);
  GPR_ASSERT(size_in_mb < 1000);
  // Ensure at least 2x as many blocks as there are cores.
  g_log.num_blocks =
      (uint32_t)GPR_MAX(2 * g_log.num_cores, (size_in_mb << CL_LOG_2_MB) >>
                                                 CENSUS_LOG_2_MAX_RECORD_SIZE);
  gpr_mu_init(&g_log.lock);
  g_log.read_iterator_state = 0;
  g_log.block_being_read = NULL;
  g_log.core_local_blocks = (cl_core_local_block*)gpr_malloc_aligned(
      g_log.num_cores * sizeof(cl_core_local_block), GPR_CACHELINE_SIZE_LOG);
  memset(g_log.core_local_blocks, 0,
         g_log.num_cores * sizeof(cl_core_local_block));
  g_log.blocks = (cl_block*)gpr_malloc_aligned(
      g_log.num_blocks * sizeof(cl_block), GPR_CACHELINE_SIZE_LOG);
  memset(g_log.blocks, 0, g_log.num_blocks * sizeof(cl_block));
  g_log.buffer = gpr_malloc(g_log.num_blocks * CENSUS_LOG_MAX_RECORD_SIZE);
  memset(g_log.buffer, 0, g_log.num_blocks * CENSUS_LOG_MAX_RECORD_SIZE);
  cl_block_list_initialize(&g_log.free_block_list);
  cl_block_list_initialize(&g_log.dirty_block_list);
  for (uint32_t i = 0; i < g_log.num_blocks; ++i) {
    cl_block* block = g_log.blocks + i;
    cl_block_initialize(block,
                        g_log.buffer + (CENSUS_LOG_MAX_RECORD_SIZE * i));
    cl_block_try_disable_access(block, 1 /* discard data */);
    cl_block_list_insert_at_tail(&g_log.free_block_list, block);
  }
  gpr_atm_rel_store(&g_log.out_of_space_count, 0);
  g_log.initialized = 1;
}

void census_log_shutdown(void) {
  GPR_ASSERT(g_log.initialized);
  gpr_mu_destroy(&g_log.lock);
  gpr_free_aligned(g_log.core_local_blocks);
  g_log.core_local_blocks = NULL;
  gpr_free_aligned(g_log.blocks);
  g_log.blocks = NULL;
  gpr_free(g_log.buffer);
  g_log.buffer = NULL;
  g_log.initialized = 0;
}

void* census_log_start_write(size_t size) {
  GPR_ASSERT(size > 0);
  GPR_ASSERT(g_log.initialized);
  if (size > CENSUS_LOG_MAX_RECORD_SIZE) {
    return NULL;
  }
  // Used to bound the number of times block allocation is attempted.
  uint32_t attempts_remaining = g_log.num_blocks;
  uint32_t core_id = gpr_cpu_current_cpu();
  do {
    void* record = NULL;
    cl_block* block =
        cl_core_local_block_get_block(&g_log.core_local_blocks[core_id]);
    if (block && (record = cl_block_start_write(block, size))) {
      return record;
    }
    // Need to allocate a new block. We are here if:
    // - no block is associated with the core OR
    // - a write is in progress on the block OR
    // - the block is out of space.
    gpr_mu_lock(&g_log.lock);
    bool allocated = cl_allocate_core_local_block(core_id, block);
    gpr_mu_unlock(&g_log.lock);
    if (!allocated) {
      gpr_atm_no_barrier_fetch_add(&g_log.out_of_space_count, 1);
      return NULL;
    }
  } while (attempts_remaining--);
  // Give up.
  gpr_atm_no_barrier_fetch_add(&g_log.out_of_space_count, 1);
  return NULL;
}

void census_log_end_write(void* record, size_t bytes_written) {
  GPR_ASSERT(g_log.initialized);
  cl_block_end_write(cl_get_block(record), bytes_written);
}

void census_log_init_reader(void) {
  GPR_ASSERT(g_log.initialized);
  gpr_mu_lock(&g_log.lock);
  // If a block is locked for reading, unlock it.
  if (g_log.block_being_read != NULL) {
    cl_block_end_read(g_log.block_being_read);
    g_log.block_being_read = NULL;
  }
  g_log.read_iterator_state = g_log.num_cores;
  gpr_mu_unlock(&g_log.lock);
}

const void* census_log_read_next(size_t* bytes_available) {
  GPR_ASSERT(g_log.initialized);
  gpr_mu_lock(&g_log.lock);
  if (g_log.block_being_read != NULL) {
    cl_block_end_read(g_log.block_being_read);
  }
  do {
    g_log.block_being_read = cl_next_block_to_read(g_log.block_being_read);
    if (g_log.block_being_read != NULL) {
      void* record =
          cl_block_start_read(g_log.block_being_read, bytes_available);
      if (record != NULL) {
        gpr_mu_unlock(&g_log.lock);
        return record;
      }
    }
  } while (g_log.block_being_read != NULL);
  gpr_mu_unlock(&g_log.lock);
  return NULL;
}

size_t census_log_remaining_space(void) {
  GPR_ASSERT(g_log.initialized);
  size_t space = 0;
  gpr_mu_lock(&g_log.lock);
  if (g_log.discard_old_records) {
    // Remaining space is not meaningful; just return the entire log space.
    space = g_log.num_blocks << CENSUS_LOG_2_MAX_RECORD_SIZE;
  } else {
    GPR_ASSERT(g_log.free_block_list.count >= 0);
    space = (size_t)g_log.free_block_list.count * CENSUS_LOG_MAX_RECORD_SIZE;
  }
  gpr_mu_unlock(&g_log.lock);
  return space;
}

int64_t census_log_out_of_space_count(void) {
  GPR_ASSERT(g_log.initialized);
  return gpr_atm_acq_load(&g_log.out_of_space_count);
}
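
// Illustrative reader-side sketch (not part of this file; the consumer is
// hypothetical). The single reader restarts the iterator, then fetches
// records until NULL. Each record should be consumed before the next call,
// since that call releases the reader lock on the block holding it:
//
//   census_log_init_reader();
//   size_t bytes = 0;
//   const void* data;
//   while ((data = census_log_read_next(&bytes)) != NULL) {
//     // Consume 'bytes' bytes starting at 'data' here.
//   }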