行到列的转换

固定模式

以下示例将结构体数组转换为 arrow::Table 实例，然后将其转换回原始结构体数组。

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// https://apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <arrow/api.h>
#include <arrow/result.h>

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <memory>
#include <vector>

using arrow::DoubleBuilder;
using arrow::Int64Builder;
using arrow::ListBuilder;

// While we want to use columnar data structures to build efficient operations, we
// often receive data in a row-wise fashion from other systems. In the following,
// we want to give a brief introduction to the classes provided by Apache Arrow by
// showing how to transform row-wise data into a columnar table.
//
// The table contains an id for a product, the number of components in the product
// and the cost of each component.
//
// The data in this example is stored in the following struct:
struct data_row {
  // Identifier of the product.
  std::int64_t id;
  // Number of components the product consists of.
  std::int64_t components;
  // Cost of each individual component of the product.
  std::vector<double> component_cost;
};

// Transforming a vector of structs into a columnar Table.
//
// The final representation should be an `arrow::Table` which in turn
// is made up of an `arrow::Schema` and a list of
// `arrow::ChunkedArray` instances. As the first step, we will iterate
// over the data and build up the arrays incrementally.  For this
// task, we provide `arrow::ArrayBuilder` classes that help in the
// construction of the final `arrow::Array` instances.
//
// For each type, Arrow has a specially typed builder class. For the primitive
// values `id` and `components` we can use the `arrow::Int64Builder`. For the
// `component_cost` vector, we need to have two builders, a top-level
// `arrow::ListBuilder` that builds the array of offsets and a nested
// `arrow::DoubleBuilder` that constructs the underlying values array that
// is referenced by the offsets in the former array.
arrow::Result<std::shared_ptr<arrow::Table>> VectorToColumnarTable(
    const std::vector<struct data_row>& rows) {
  // Every builder below allocates from Arrow's default memory pool.
  arrow::MemoryPool* memory_pool = arrow::default_memory_pool();

  // One builder per output column. The list column needs two cooperating
  // builders: the ListBuilder tracks where each row's list starts, and the
  // DoubleBuilder it owns collects the flattened values of all lists.
  Int64Builder ids(memory_pool);
  Int64Builder component_counts(memory_pool);
  ListBuilder cost_lists(memory_pool, std::make_shared<DoubleBuilder>(memory_pool));
  // Non-owning pointer: the value builder is owned by `cost_lists`.
  auto* cost_values = static_cast<DoubleBuilder*>(cost_lists.value_builder());

  // Feed the rows into the builders. Each append may fail (for example when
  // memory cannot be allocated), so every returned arrow::Status is checked and
  // propagated to the caller.
  for (const auto& entry : rows) {
    ARROW_RETURN_NOT_OK(ids.Append(entry.id));
    ARROW_RETURN_NOT_OK(component_counts.Append(entry.components));

    // Start a new list entry for this row, then copy the row's costs straight
    // out of the std::vector<double> storage into the value builder.
    const std::vector<double>& costs = entry.component_cost;
    ARROW_RETURN_NOT_OK(cost_lists.Append());
    ARROW_RETURN_NOT_OK(cost_values->AppendValues(costs.data(), costs.size()));
  }

  // Turn the builders into immutable arrays. Finishing the list builder also
  // finishes the value builder it owns, so `cost_values` needs no Finish call.
  std::shared_ptr<arrow::Array> id_column;
  std::shared_ptr<arrow::Array> count_column;
  std::shared_ptr<arrow::Array> cost_column;
  ARROW_RETURN_NOT_OK(ids.Finish(&id_column));
  ARROW_RETURN_NOT_OK(component_counts.Finish(&count_column));
  ARROW_RETURN_NOT_OK(cost_lists.Finish(&cost_column));

  // Describe the column names and types.
  const std::vector<std::shared_ptr<arrow::Field>> fields = {
      arrow::field("id", arrow::int64()),
      arrow::field("components", arrow::int64()),
      arrow::field("component_cost", arrow::list(arrow::float64())),
  };
  auto table_schema = std::make_shared<arrow::Schema>(fields);

  // The table shares ownership of the arrays through shared_ptr, so it remains
  // usable after the builders and the local array handles go out of scope.
  const std::vector<std::shared_ptr<arrow::Array>> columns = {id_column, count_column,
                                                              cost_column};
  return arrow::Table::Make(table_schema, columns);
}

// Converts a table with the schema (id: int64, components: int64,
// component_cost: list<float64>) back into a vector of `data_row`.
//
// Returns arrow::Status::Invalid if `table` is null or its schema differs from
// the expected one. Columns may consist of any number of chunks.
//
// Simplification kept from the original example: null entries are not handled;
// every row is assumed to hold valid values.
arrow::Result<std::vector<data_row>> ColumnarTableToVector(
    const std::shared_ptr<arrow::Table>& table) {
  if (table == nullptr) {
    return arrow::Status::Invalid("Table must not be null");
  }

  // The schema alone is enough to check that the table has the expected layout.
  std::vector<std::shared_ptr<arrow::Field>> schema_vector = {
      arrow::field("id", arrow::int64()), arrow::field("components", arrow::int64()),
      arrow::field("component_cost", arrow::list(arrow::float64()))};
  auto expected_schema = std::make_shared<arrow::Schema>(schema_vector);

  if (!expected_schema->Equals(*table->schema())) {
    // The table doesn't have the expected schema thus we cannot directly
    // convert it to our target representation.
    return arrow::Status::Invalid("Schemas are not matching!");
  }

  std::vector<data_row> rows;
  rows.reserve(static_cast<size_t>(table->num_rows()));

  // A column may be split into several chunks, and different columns may be
  // chunked differently. TableBatchReader hands out record batches in which all
  // columns are aligned, so each batch can be processed with plain array access
  // instead of assuming that everything lives in chunk 0.
  arrow::TableBatchReader reader(*table);
  std::shared_ptr<arrow::RecordBatch> batch;
  while (true) {
    ARROW_RETURN_NOT_OK(reader.ReadNext(&batch));
    if (batch == nullptr) break;  // end of table

    // The schema check above makes these downcasts safe.
    auto ids = std::static_pointer_cast<arrow::Int64Array>(batch->column(0));
    auto components = std::static_pointer_cast<arrow::Int64Array>(batch->column(1));
    auto component_cost = std::static_pointer_cast<arrow::ListArray>(batch->column(2));
    auto component_cost_values =
        std::static_pointer_cast<arrow::DoubleArray>(component_cost->values());

    // The primitive columns are read through Value(i). The list values are
    // copied from the raw value buffer; value_offset(i) gives the position of
    // row i's first value inside that buffer.
    const double* ccv_ptr = component_cost_values->raw_values();
    for (int64_t i = 0; i < batch->num_rows(); ++i) {
      const double* first = ccv_ptr + component_cost->value_offset(i);
      const double* last = ccv_ptr + component_cost->value_offset(i + 1);
      rows.push_back(
          {ids->Value(i), components->Value(i), std::vector<double>(first, last)});
    }
  }

  return rows;
}

// Round-trips a small set of rows through an arrow::Table and prints the rows
// that come back.
//
// Returns the first error reported by either conversion, or
// arrow::Status::Invalid if the round trip changed the number of rows.
arrow::Status RunRowConversion() {
  std::vector<data_row> original_rows = {
      {1, 1, {10.0}}, {2, 3, {11.0, 12.0, 13.0}}, {3, 2, {15.0, 25.0}}};
  std::shared_ptr<arrow::Table> table;
  std::vector<data_row> converted_rows;

  ARROW_ASSIGN_OR_RAISE(table, VectorToColumnarTable(original_rows));

  ARROW_ASSIGN_OR_RAISE(converted_rows, ColumnarTableToVector(table));

  // Reported through the Status rather than assert() so that the check is also
  // active in builds that define NDEBUG.
  if (original_rows.size() != converted_rows.size()) {
    return arrow::Status::Invalid("Row count changed during round trip: expected ",
                                  original_rows.size(), ", got ",
                                  converted_rows.size());
  }

  // Print out contents of table, should get
  // ID Components Component prices
  // 1  1          10
  // 2  3          11  12  13
  // 3  2          15  25
  std::cout << std::left << std::setw(3) << "ID " << std::left << std::setw(11)
            << "Components " << std::left << std::setw(15) << "Component prices "
            << std::endl;
  for (const auto& row : converted_rows) {
    std::cout << std::left << std::setw(3) << row.id << std::left << std::setw(11)
              << row.components;
    for (const auto& cost : row.component_cost) {
      std::cout << std::left << std::setw(4) << cost;
    }
    std::cout << std::endl;
  }
  return arrow::Status::OK();
}

int main(int argc, char** argv) {
  auto status = RunRowConversion();
  if (!status.ok()) {
    std::cerr << status.ToString() << std::endl;
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}

动态模式

在许多情况下，我们需要在 Arrow 数据与模式直到运行时才可知的行数据之间相互转换。为了帮助实现这些转换，此库提供了几个实用工具：

arrow::RecordBatchBuilder：为完整记录批次创建和管理数组构建器。
arrow::VisitTypeInline()：分派到专门针对给定数组类型的函数。
类型特性（例如 arrow::enable_if_primitive_ctype）：将模板函数缩小到特定的 Arrow 类型，与访问者模式结合使用非常有用。
arrow::TableBatchReader：一次读取一个批次的表，每个批次都是一个零拷贝切片。

以下示例展示了如何在 rapidjson::Document 和 Arrow 对象之间实现转换。完整的代码示例可以在 apache/arrow 仓库中查看。

编写到 Arrow 的转换

要将行转换为 Arrow 记录批次，我们将为所有列设置数组构建器，然后对于每个字段，遍历行值并附加到构建器。我们假设目标模式已知：它可能由另一个系统提供，或者已在另一个函数中推断出来。在转换过程中推断模式是一项颇具挑战性的任务；如果尚无可用模式，许多系统会检查前 N 行来推断模式。

在顶层，我们定义一个函数 ConvertToRecordBatch：

/// \brief Convert JSON documents into a record batch laid out as `schema`.
///
/// \param rows one JSON document per output row
/// \param schema the target schema; one column is produced per field
/// \return the assembled record batch, or the first error raised while
///         creating the builders, converting a field or flushing the batch
arrow::Result<std::shared_ptr<arrow::RecordBatch>> ConvertToRecordBatch(
    const std::vector<rapidjson::Document>& rows, std::shared_ptr<arrow::Schema> schema) {
  // RecordBatchBuilder sets up one array builder per schema field. Handing it
  // the row count (`rows.size()`) as the initial capacity lets it pre-allocate
  // the arrays; string, byte and list arrays have dynamic lengths, so their
  // final size cannot be known up front.
  ARROW_ASSIGN_OR_RAISE(
      std::unique_ptr<arrow::RecordBatchBuilder> batch_builder,
      arrow::RecordBatchBuilder::Make(schema, arrow::default_memory_pool(), rows.size()));

  // The converter walks the rows once per field and appends that field's
  // values to the builder it is given.
  JsonValueConverter converter(rows);
  const int field_count = batch_builder->num_fields();
  for (int column = 0; column < field_count; ++column) {
    const arrow::Field& field = *schema->field(column);
    ARROW_RETURN_NOT_OK(converter.Convert(field, batch_builder->GetField(column)));
  }

  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch,
                        batch_builder->Flush());

  // Use RecordBatch::ValidateFull() to make sure arrays were correctly constructed.
  DCHECK_OK(batch->ValidateFull());
  return batch;
}  // ConvertToRecordBatch

首先，我们使用 arrow::RecordBatchBuilder，它可以方便地为模式中的每个字段创建构建器。然后我们迭代模式的字段，获取构建器，并对我们的 JsonValueConverter 调用 Convert()（接下来讨论）。最后，我们调用 batch->ValidateFull()，它检查我们的数组的完整性，以确保转换正确执行，这对于调试新的转换实现非常有用。

再下一层，JsonValueConverter 负责将提供的字段的行值附加到提供的数组构建器。为了专门化每种数据类型的逻辑，它实现了 Visit 方法并调用 arrow::VisitTypeInline()。（有关类型访问者的更多信息，请参见访问者模式。）

在该类的末尾是私有方法 FieldValues()，它返回跨行的当前字段的列值的迭代器。在扁平的基于行的结构（例如值的向量）中，这可能很容易实现。但是，如果模式是嵌套的，例如在 JSON 文档的情况下，则需要一个特殊的迭代器来导航嵌套的级别。有关 DocValuesIterator 的实现细节，请参见完整示例。

/// \brief Appends the values of one field, gathered across all rows, to an
/// Arrow array builder.
///
/// The converter is an Arrow type visitor: Convert() remembers the field name
/// and the target builder, then dispatches through arrow::VisitTypeInline() to
/// the Visit() overload for the field's type. Types without a dedicated
/// overload yield Status::NotImplemented.
///
/// A JSON value whose kind does not match the Arrow type yields Status::Invalid.
///
/// The rows are held by reference and must outlive the converter.
class JsonValueConverter {
 public:
  /// \brief Create a converter that reads fields at the top level of each row.
  explicit JsonValueConverter(const std::vector<rapidjson::Document>& rows)
      : rows_(rows), array_levels_(0) {}

  /// \brief Create a converter for values nested below `root_path`.
  ///
  /// \param rows the JSON documents to read from
  /// \param root_path names of the enclosing fields, outermost first
  /// \param array_levels number of list levels the values are nested in; it is
  ///        forwarded unchanged to DocValuesIterator
  JsonValueConverter(const std::vector<rapidjson::Document>& rows,
                     const std::vector<std::string>& root_path, int64_t array_levels)
      : rows_(rows), root_path_(root_path), array_levels_(array_levels) {}

  /// \brief For field passed in, append corresponding values to builder
  arrow::Status Convert(const arrow::Field& field, arrow::ArrayBuilder* builder) {
    return Convert(field, field.name(), builder);
  }

  /// \brief For field passed in, append corresponding values to builder
  ///
  /// \param field supplies the Arrow type to convert to
  /// \param field_name key to look up in the JSON; an empty name means the
  ///        values sit directly at the root path (used for list items)
  /// \param builder target builder; it must be the builder class matching
  ///        `field`'s type because the Visit() overloads downcast it unchecked
  arrow::Status Convert(const arrow::Field& field, const std::string& field_name,
                        arrow::ArrayBuilder* builder) {
    field_name_ = field_name;
    builder_ = builder;
    ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field.type(), this));
    return arrow::Status::OK();
  }

  // Default implementation
  arrow::Status Visit(const arrow::DataType& type) {
    return arrow::Status::NotImplemented(
        "Cannot convert json value to Arrow array of type ", type.ToString());
  }

  arrow::Status Visit(const arrow::Int64Type& type) {
    arrow::Int64Builder* builder = static_cast<arrow::Int64Builder*>(builder_);
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      if (value->IsNull()) {
        ARROW_RETURN_NOT_OK(builder->AppendNull());
      } else if (value->IsInt64()) {
        // IsInt64() holds for every JSON integer that is representable as
        // int64, whichever narrower integer kinds it also satisfies.
        ARROW_RETURN_NOT_OK(builder->Append(value->GetInt64()));
      } else if (value->IsUint64()) {
        // An unsigned value above INT64_MAX would wrap to a negative number if
        // it were appended, so reject it instead.
        return arrow::Status::Invalid("Unsigned integer ", value->GetUint64(),
                                      " does not fit in int64");
      } else {
        return arrow::Status::Invalid("Value is not an integer");
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::DoubleType& type) {
    arrow::DoubleBuilder* builder = static_cast<arrow::DoubleBuilder*>(builder_);
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      if (value->IsNull()) {
        ARROW_RETURN_NOT_OK(builder->AppendNull());
      } else if (value->IsNumber()) {
        ARROW_RETURN_NOT_OK(builder->Append(value->GetDouble()));
      } else {
        return arrow::Status::Invalid("Value is not a number");
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::StringType& type) {
    arrow::StringBuilder* builder = static_cast<arrow::StringBuilder*>(builder_);
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      if (value->IsNull()) {
        ARROW_RETURN_NOT_OK(builder->AppendNull());
      } else if (value->IsString()) {
        // Pass the length explicitly so that strings containing embedded NUL
        // characters are not cut short.
        ARROW_RETURN_NOT_OK(builder->Append(
            value->GetString(), static_cast<int32_t>(value->GetStringLength())));
      } else {
        return arrow::Status::Invalid("Value is not a string");
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::BooleanType& type) {
    arrow::BooleanBuilder* builder = static_cast<arrow::BooleanBuilder*>(builder_);
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      if (value->IsNull()) {
        ARROW_RETURN_NOT_OK(builder->AppendNull());
      } else if (value->IsBool()) {
        ARROW_RETURN_NOT_OK(builder->Append(value->GetBool()));
      } else {
        return arrow::Status::Invalid("Value is not a boolean");
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::StructType& type) {
    arrow::StructBuilder* builder = static_cast<arrow::StructBuilder*>(builder_);

    // Children are looked up below this struct's own path.
    std::vector<std::string> child_path(root_path_);
    if (field_name_.size() > 0) {
      child_path.push_back(field_name_);
    }
    auto child_converter = JsonValueConverter(rows_, child_path, array_levels_);

    // Fill every child builder first ...
    for (int i = 0; i < type.num_fields(); ++i) {
      std::shared_ptr<arrow::Field> child_field = type.field(i);
      std::shared_ptr<arrow::ArrayBuilder> child_builder = builder->child_builder(i);

      ARROW_RETURN_NOT_OK(child_converter.Convert(*child_field, child_builder.get()));
    }

    // ... then record, per struct value, whether the struct itself is null.
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      ARROW_RETURN_NOT_OK(builder->Append(!value->IsNull()));
    }

    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::ListType& type) {
    arrow::ListBuilder* builder = static_cast<arrow::ListBuilder*>(builder_);

    // Values and offsets needs to be interleaved in ListBuilder, so first collect the
    // values
    std::unique_ptr<arrow::ArrayBuilder> tmp_value_builder;
    ARROW_ASSIGN_OR_RAISE(tmp_value_builder,
                          arrow::MakeBuilder(builder->value_builder()->type()));
    std::vector<std::string> child_path(root_path_);
    // NOTE(review): unlike Visit(StructType) and FieldValues(), the field name
    // is pushed even when it is empty (as it is for a list nested directly in a
    // list). Whether that is intended depends on DocValuesIterator — confirm.
    child_path.push_back(field_name_);
    auto child_converter = JsonValueConverter(rows_, child_path, array_levels_ + 1);
    ARROW_RETURN_NOT_OK(
        child_converter.Convert(*type.value_field(), "", tmp_value_builder.get()));

    std::shared_ptr<arrow::Array> values_array;
    ARROW_RETURN_NOT_OK(tmp_value_builder->Finish(&values_array));
    std::shared_ptr<arrow::ArrayData> values_data = values_array->data();

    // Second pass: open one list slot per JSON value and move that list's run
    // of already-converted values into the real value builder.
    arrow::ArrayBuilder* value_builder = builder->value_builder();
    int64_t offset = 0;
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      if (value->IsNull()) {
        ARROW_RETURN_NOT_OK(builder->Append(false));
        continue;
      }
      if (!value->IsArray()) {
        return arrow::Status::Invalid("Value is not an array");
      }
      ARROW_RETURN_NOT_OK(builder->Append(true));
      if (value->Size() > 0) {
        ARROW_RETURN_NOT_OK(
            value_builder->AppendArraySlice(*values_data, offset, value->Size()));
        offset += value->Size();
      }
    }

    return arrow::Status::OK();
  }

 private:
  // Name of the field currently being converted; set by Convert().
  std::string field_name_;
  // Target builder for the current field (not owned); set by Convert().
  arrow::ArrayBuilder* builder_ = nullptr;
  // Source documents (not owned).
  const std::vector<rapidjson::Document>& rows_;
  // Names of the enclosing fields, outermost first.
  std::vector<std::string> root_path_;
  // List nesting depth passed on to DocValuesIterator.
  int64_t array_levels_;

  /// Return a flattened iterator over values at nested location
  arrow::Iterator<const rapidjson::Value*> FieldValues() {
    std::vector<std::string> path(root_path_);
    if (field_name_.size() > 0) {
      path.push_back(field_name_);
    }
    auto iter = DocValuesIterator(rows_, std::move(path), array_levels_);
    // The lambda owns its copy of the iterator state, so the returned
    // arrow::Iterator does not refer to any local of this function.
    auto fn = [iter]() mutable -> arrow::Result<const rapidjson::Value*> {
      return iter.Next();
    };

    return arrow::MakeFunctionIterator(fn);
  }
};  // JsonValueConverter

编写从 Arrow 的转换

要从 Arrow 记录批次转换为行，我们将以较小的批次处理表，访问批次的每个字段，并逐列填充输出行。

在顶层，我们定义 ArrowToDocumentConverter，它提供将 Arrow 批次和表转换为行的 API。在许多情况下，以较小的批次执行到行的转换比一次完成整个表更优。因此，我们定义了一个 ConvertToVector 方法来转换单个批次，然后在另一个转换方法中，我们使用 arrow::TableBatchReader 迭代表的切片。这将返回 Arrow 的迭代器类型 (arrow::Iterator)，因此可以一次处理一行或将行收集到容器中。

/// \brief Converts Arrow record batches and tables into rapidjson documents,
/// one document per row.
///
/// The class holds no state of its own; the per-column work is done by
/// RowBatchBuilder.
class ArrowToDocumentConverter {
 public:
  /// Convert a single batch of Arrow data into Documents
  ///
  /// \param batch the batch to convert
  /// \return one document per row of `batch`, Status::Invalid if `batch` is
  ///         null, or the error raised while visiting a column (for example
  ///         NotImplemented for an unsupported array type)
  arrow::Result<std::vector<rapidjson::Document>> ConvertToVector(
      std::shared_ptr<arrow::RecordBatch> batch) {
    if (batch == nullptr) {
      return arrow::Status::Invalid("Record batch must not be null");
    }
    RowBatchBuilder builder{batch->num_rows()};

    // Fill the rows column by column: the builder is told which field comes
    // next and then visits that column's array.
    for (int i = 0; i < batch->num_columns(); ++i) {
      builder.SetField(batch->schema()->field(i).get());
      ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*batch->column(i), &builder));
    }

    return std::move(builder).Rows();
  }

  /// Convert an Arrow table into an iterator of Documents
  ///
  /// \param table the table to convert
  /// \param batch_size maximum number of rows converted at a time
  /// \return a lazy iterator over the rows; conversion errors surface when the
  ///         iterator is advanced
  ///
  /// The returned iterator keeps `table` alive and does not refer to this
  /// converter object, so it may outlive both the caller's table handle and
  /// the converter.
  arrow::Iterator<rapidjson::Document> ConvertToIterator(
      std::shared_ptr<arrow::Table> table, size_t batch_size) {
    // Use TableBatchReader to divide table into smaller batches. The batches
    // created are zero-copy slices with *at most* `batch_size` rows.
    // The reader is handed the shared_ptr (not a `const Table&`) so that it
    // shares ownership of the table for as long as the iterator exists.
    auto batch_reader = std::make_shared<arrow::TableBatchReader>(std::move(table));
    batch_reader->set_chunksize(static_cast<int64_t>(batch_size));

    // The lambda captures nothing: because the converter is stateless, a fresh
    // instance is used for each batch instead of capturing `this`, which would
    // dangle if this object were destroyed before the iterator is drained.
    auto read_batch = [](const std::shared_ptr<arrow::RecordBatch>& batch)
        -> arrow::Result<arrow::Iterator<rapidjson::Document>> {
      ARROW_ASSIGN_OR_RAISE(auto rows, ArrowToDocumentConverter().ConvertToVector(batch));
      return arrow::MakeVectorIterator(std::move(rows));
    };

    // batches -> one row iterator per batch -> a single flat row iterator
    auto nested_iter = arrow::MakeMaybeMapIterator(
        read_batch, arrow::MakeIteratorFromReader(std::move(batch_reader)));

    return arrow::MakeFlattenIterator(std::move(nested_iter));
  }
};  // ArrowToDocumentConverter

再下一层，输出行由 RowBatchBuilder 填充。RowBatchBuilder 实现了 Visit() 方法，但为了节省代码，我们为具有原始 C 等价物（布尔值、整数和浮点数）的数组类型编写了一个模板方法，使用 arrow::enable_if_primitive_ctype。有关其他类型谓词，请参见类型特性。

class RowBatchBuilder {
 public:
  explicit RowBatchBuilder(int64_t num_rows) : field_(nullptr) {
    // Reserve all of the space required up-front to avoid unnecessary resizing
    rows_.reserve(num_rows);

    for (int64_t i = 0; i < num_rows; ++i) {
      rows_.push_back(rapidjson::Document());
      rows_[i].SetObject();
    }
  }

  /// \brief Set which field to convert.
  void SetField(const arrow::Field* field) { field_ = field; }

  /// \brief Retrieve converted rows from builder.
  std::vector<rapidjson::Document> Rows() && { return std::move(rows_); }

  // Default implementation
  arrow::Status Visit(const arrow::Array& array) {
    return arrow::Status::NotImplemented(
        "Cannot convert to json document for array of type ", array.type()->ToString());
  }

  // Handles booleans, integers, floats
  template <typename ArrayType, typename DataClass = typename ArrayType::TypeClass>
  arrow::enable_if_primitive_ctype<DataClass, arrow::Status> Visit(
      const ArrayType& array) {
    assert(static_cast<int64_t>(rows_.size()) == array.length());
    for (int64_t i = 0; i < array.length(); ++i) {
      if (!array.IsNull(i)) {
        rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator());
        rows_[i].AddMember(str_key, array.Value(i), rows_[i].GetAllocator());
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::StringArray& array) {
    assert(static_cast<int64_t>(rows_.size()) == array.length());
    for (int64_t i = 0; i < array.length(); ++i) {
      if (!array.IsNull(i)) {
        rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator());
        std::string_view value_view = array.Value(i);
        rapidjson::Value value;
        value.SetString(value_view.data(),
                        static_cast<rapidjson::SizeType>(value_view.size()),
                        rows_[i].GetAllocator());
        rows_[i].AddMember(str_key, value, rows_[i].GetAllocator());
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::StructArray& array) {
    const arrow::StructType* type = array.struct_type();

    assert(static_cast<int64_t>(rows_.size()) == array.length());

    RowBatchBuilder child_builder(rows_.size());
    for (int i = 0; i < type->num_fields(); ++i) {
      const arrow::Field* child_field = type->field(i).get();
      child_builder.SetField(child_field);
      ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array.field(i).get(), &child_builder));
    }
    std::vector<rapidjson::Document> rows = std::move(child_builder).Rows();

    for (int64_t i = 0; i < array.length(); ++i) {
      if (!array.IsNull(i)) {
        rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator());
        // Must copy value to new allocator
        rapidjson::Value row_val;
        row_val.CopyFrom(rows[i], rows_[i].GetAllocator());
        rows_[i].AddMember(str_key, row_val, rows_[i].GetAllocator());
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::ListArray& array) {
    assert(static_cast<int64_t>(rows_.size()) == array.length());
    // First create rows from values
    std::shared_ptr<arrow::Array> values = array.values();
    RowBatchBuilder child_builder(values->length());
    const arrow::Field* value_field = array.list_type()->value_field().get();
    std::string value_field_name = value_field->name();
    child_builder.SetField(value_field);
    ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*values.get(), &child_builder));

    std::vector<rapidjson::Document> rows = std::move(child_builder).Rows();

    int64_t values_i = 0;
    for (int64_t i = 0; i < array.length(); ++i) {
      if (array.IsNull(i)) continue;

      rapidjson::Document::AllocatorType& allocator = rows_[i].GetAllocator();
      auto array_len = array.value_length(i);

      rapidjson::Value value;
      value.SetArray();
      value.Reserve(array_len, allocator);

      for (int64_t j = 0; j < array_len; ++j) {
        rapidjson::Value row_val;
        // Must copy value to new allocator
        row_val.CopyFrom(rows[values_i][value_field_name], allocator);
        value.PushBack(row_val, allocator);
        ++values_i;
      }

      rapidjson::Value str_key(field_->name(), allocator);
      rows_[i].AddMember(str_key, value, allocator);
    }

    return arrow::Status::OK();
  }

 private:
  const arrow::Field* field_;
  std::vector<rapidjson::Document> rows_;
};  // RowBatchBuilder