创建 Arrow 对象

与创建数组、表格、张量和其他所有 Arrow 实体相关的食谱。

从标准 C++ 创建数组

借助 arrow::ArrayBuilder 的各个类型化子类,可以轻松、高效地从现有的 C++ 数据创建 Arrow 数组。

从 C++ 基本类型创建数组
arrow::Int32Builder builder;
ARROW_RETURN_NOT_OK(builder.Append(1));
ARROW_RETURN_NOT_OK(builder.Append(2));
ARROW_RETURN_NOT_OK(builder.Append(3));
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, builder.Finish())
rout << arr->ToString() << std::endl;
代码输出
[
  1,
  2,
  3
]

注意

构建器会根据需要分配内存,插入操作的摊销时间复杂度为常数。

构建器还可以使用标准 C++ 容器

// Raw pointers: hand the builder a pointer to contiguous data plus a length
arrow::Int64Builder int64_builder;
const std::array<int64_t, 4> int64_values = {1, 2, 3, 4};
ARROW_RETURN_NOT_OK(
    int64_builder.AppendValues(int64_values.data(), int64_values.size()));
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, int64_builder.Finish());
rout << arr->ToString() << std::endl;

// Vectors: a std::vector of strings can be appended in a single call
arrow::StringBuilder string_builder;
const std::vector<std::string> string_values = {"x", "y", "z"};
ARROW_RETURN_NOT_OK(string_builder.AppendValues(string_values));
ARROW_ASSIGN_OR_RAISE(arr, string_builder.Finish());
rout << arr->ToString() << std::endl;

// Iterators: any [first, last) range works; the std::set keeps a single copy
// of the duplicated 1.1, so only two values are appended
arrow::DoubleBuilder double_builder;
const std::set<double> double_values = {1.1, 1.1, 2.3};
ARROW_RETURN_NOT_OK(
    double_builder.AppendValues(double_values.begin(), double_values.end()));
ARROW_ASSIGN_OR_RAISE(arr, double_builder.Finish());
rout << arr->ToString() << std::endl;
代码输出
[
  1,
  2,
  3,
  4
]
[
  "x",
  "y",
  "z"
]
[
  1.1,
  2.3
]

注意

构建器不会获取容器中数据的所有权,而是会复制底层数据。

为给定模式生成随机数据

要为给定模式生成随机数据,实现一个类型访问者(visitor)是个不错的办法。以下示例仅实现了双精度浮点数组和列表数组,但可以轻松扩展到所有类型。

使用访问者模式生成随机记录批次
 1class RandomBatchGenerator {
 2 public:
 3  std::shared_ptr<arrow::Schema> schema;
 4
 5  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : schema(schema){};
 6
 7  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t num_rows) {
 8    num_rows_ = num_rows;
 9    for (std::shared_ptr<arrow::Field> field : schema->fields()) {
10      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
11    }
12
13    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
14  }
15
16  // Default implementation
17  arrow::Status Visit(const arrow::DataType& type) {
18    return arrow::Status::NotImplemented("Generating data for", type.ToString());
19  }
20
21  arrow::Status Visit(const arrow::DoubleType&) {
22    auto builder = arrow::DoubleBuilder();
23    std::normal_distribution<> d{/*mean=*/5.0, /*stddev=*/2.0};
24    for (int32_t i = 0; i < num_rows_; ++i) {
25      ARROW_RETURN_NOT_OK(builder.Append(d(gen_)));
26    }
27    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
28    arrays_.push_back(array);
29    return arrow::Status::OK();
30  }
31
32  arrow::Status Visit(const arrow::ListType& type) {
33    // Generate offsets first, which determines number of values in sub-array
34    std::poisson_distribution<> d{/*mean=*/4};
35    auto builder = arrow::Int32Builder();
36    ARROW_RETURN_NOT_OK(builder.Append(0));
37    int32_t last_val = 0;
38    for (int32_t i = 0; i < num_rows_; ++i) {
39      last_val += d(gen_);
40      ARROW_RETURN_NOT_OK(builder.Append(last_val));
41    }
42    ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
43
44    // Since children of list has a new length, will use a new generator
45    RandomBatchGenerator value_gen(arrow::schema({arrow::field("x", type.value_type())}));
46    // Last index from the offsets array becomes the length of the sub-array
47    ARROW_ASSIGN_OR_RAISE(auto inner_batch, value_gen.Generate(last_val));
48    std::shared_ptr<arrow::Array> values = inner_batch->column(0);
49
50    ARROW_ASSIGN_OR_RAISE(auto array,
51                          arrow::ListArray::FromArrays(*offsets.get(), *values.get()));
52    arrays_.push_back(array);
53
54    return arrow::Status::OK();
55  }
56
57 protected:
58  std::random_device rd_{};
59  std::mt19937 gen_{rd_()};
60  std::vector<std::shared_ptr<arrow::Array>> arrays_;
61  int32_t num_rows_;
62};  // RandomBatchGenerator

有了这样的生成器,您就可以为任何受支持的模式创建随机测试数据:

// Describe the batch layout: a float64 column and a list<float64> column
const std::shared_ptr<arrow::Field> x_field = arrow::field("x", arrow::float64());
const std::shared_ptr<arrow::Field> y_field =
    arrow::field("y", arrow::list(arrow::float64()));
std::shared_ptr<arrow::Schema> schema = arrow::schema({x_field, y_field});

// Ask the generator for five rows of random data
RandomBatchGenerator generator(schema);
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, generator.Generate(5));

rout << "Created batch: \n" << batch->ToString();

// ValidateFull checks the generated batch for correctness
ARROW_RETURN_NOT_OK(batch->ValidateFull());
代码输出
Created batch: 
x:   [
    4.546911589795752,
    6.984533198458078,
    7.617112892424505,
    7.071039704261608,
    5.333380507036075
  ]
y:   [
    [
      6.162093180569001,
      4.264271666435832,
      4.453379826203139
    ],
    [
      5.550493157228391,
      2.2790346108514914,
      6.320687795635024,
      5.790474643286342
    ],
    [
      6.1749549303569,
      1.2247191609769907,
      10.309335708651332,
      2.7148579213976567,
      0.7332353370369562,
      7.925025202564361,
      4.011131470597689
    ],
    [
      3.051431659823732,
      6.459224633329098,
      6.545469562979236,
      4.2098221381083905,
      4.227733269678735,
      5.916080551640544
    ],
    [
      5.996692460353367,
      3.8667241669428876,
      1.3804329308731353,
      5.711691758211411,
      3.4554154047425714,
      3.102919934591531
    ]
  ]