创建 Arrow 对象

向量是 Arrow Java 库中的基本单元。数据类型描述了值的类型;ValueVectors 是类型化值的序列。向量表示相同类型值的一维序列,并且是可变容器。

向量实现接口 ValueVector。Arrow 库为各种数据类型提供了向量实现。

创建向量(数组)

Int 数组

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;

// Build a three-element IntVector holding 1, 2, 3 and print it.
// The allocator and the vector are both released by try-with-resources.
try (
    BufferAllocator allocator = new RootAllocator();
    IntVector intVector = new IntVector("intVector", allocator)
) {
    final int count = 3;
    intVector.allocateNew(count);
    for (int index = 0; index < count; index++) {
        // Slot `index` receives the value index + 1.
        intVector.set(index, index + 1);
    }
    // Record how many slots hold data before the vector is printed.
    intVector.setValueCount(count);

    System.out.print(intVector);
}
[1, 2, 3]

Varchar 数组

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;

import java.nio.charset.StandardCharsets;

try(
    BufferAllocator allocator = new RootAllocator();
    VarCharVector varCharVector = new VarCharVector("varCharVector", allocator);
) {
    varCharVector.allocateNew(3);
    varCharVector.set(0, "one".getBytes());
    varCharVector.set(1, "two".getBytes());
    varCharVector.set(2, "three".getBytes());
    varCharVector.setValueCount(3);

    System.out.print(varCharVector);
}
[one, two, three]

字典编码的 Varchar 数组

在某些情况下,对列进行字典编码有助于节省内存。

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.dictionary.Dictionary;
import org.apache.arrow.vector.dictionary.DictionaryEncoder;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.DictionaryEncoding;

import java.nio.charset.StandardCharsets;

// Dictionary-encode a VarCharVector of country names against a ten-entry
// dictionary, printing the dictionary, the raw data and the encoded indices.
try (BufferAllocator root = new RootAllocator();
     VarCharVector countries = new VarCharVector("country-dict", root);
     VarCharVector appUserCountriesUnencoded = new VarCharVector("app-use-country-dict", root)
) {
    // Dictionary values, stored in the vector at their array positions.
    String[] dictionaryValues = {
        "Andorra", "Cuba", "Grecia", "Guinea", "Islandia",
        "Malta", "Tailandia", "Uganda", "Yemen", "Zambia"
    };
    countries.allocateNew(dictionaryValues.length);
    for (int i = 0; i < dictionaryValues.length; i++) {
        countries.set(i, dictionaryValues[i].getBytes(StandardCharsets.UTF_8));
    }
    countries.setValueCount(dictionaryValues.length);

    // Encoding metadata: id 1, unordered, indices typed as signed 8-bit integers.
    DictionaryEncoding encoding =
            new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/new ArrowType.Int(8, true));
    Dictionary countriesDictionary = new Dictionary(countries, encoding);
    System.out.println("Dictionary: " + countriesDictionary);

    // The unencoded column: every value here also appears in the dictionary.
    String[] userCountries = {"Andorra", "Guinea", "Islandia", "Malta", "Uganda"};
    appUserCountriesUnencoded.allocateNew(userCountries.length);
    for (int i = 0; i < userCountries.length; i++) {
        appUserCountriesUnencoded.set(i, userCountries[i].getBytes(StandardCharsets.UTF_8));
    }
    appUserCountriesUnencoded.setValueCount(userCountries.length);
    System.out.println("Unencoded data: " + appUserCountriesUnencoded);

    // The encoded vector is a separate resource, so it gets its own try block.
    try (FieldVector appUserCountriesDictionaryEncoded =
             (FieldVector) DictionaryEncoder.encode(appUserCountriesUnencoded, countriesDictionary)) {
        System.out.println("Dictionary-encoded data: " + appUserCountriesDictionaryEncoded);
    }
}
Dictionary: Dictionary DictionaryEncoding[id=1,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
Dictionary-encoded data: [0, 3, 4, 5, 7]

List 数组

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.complex.impl.UnionListWriter;
import org.apache.arrow.vector.complex.ListVector;

// Build a ListVector of four lists with three ints each, then print it.
try (
    BufferAllocator allocator = new RootAllocator();
    ListVector listVector = ListVector.empty("listVector", allocator);
    UnionListWriter listWriter = listVector.getWriter()
) {
    // One inner array per list element of the vector.
    int[][] rows = {
        {1, 2, 3},
        {10, 20, 30},
        {100, 200, 300},
        {1000, 2000, 3000}
    };
    for (int row = 0; row < rows.length; row++) {
        // Point the writer at the list slot, then write that list's values.
        listWriter.setPosition(row);
        listWriter.startList();
        for (int value : rows[row]) {
            listWriter.writeInt(value);
        }
        listWriter.setValueCount(rows[row].length);
        listWriter.endList();
    }
    // Record how many lists the vector holds before it is printed.
    listVector.setValueCount(rows.length);

    System.out.print(listVector);
} catch (Exception e) {
    // Any failure inside the block is reported via its stack trace.
    e.printStackTrace();
}
[[1,2,3], [10,20,30], [100,200,300], [1000,2000,3000]]

切片

切片提供了一种在两个相同类型的向量之间复制一段行范围的方法。

切片 IntVector

在这个例子中,我们将输入 IntVector 的一部分复制到一个新的 IntVector 中。

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.util.TransferPair;

// Slice an IntVector twice with splitAndTransfer and print each slice.
try (BufferAllocator allocator = new RootAllocator();
     IntVector vector = new IntVector("intVector", allocator)) {
    final int total = 10;
    // Fill the source vector with 0..9; slot i holds the value i.
    for (int value = 0; value < total; value++) {
        vector.setSafe(value, value);
    }
    vector.setValueCount(total);

    // First slice: 5 elements starting at index 0.
    TransferPair headPair = vector.getTransferPair(allocator);
    headPair.splitAndTransfer(0, 5);
    try (IntVector head = (IntVector) headPair.getTo()) {
        System.out.println(head);
    }

    // Second slice: 6 elements starting at index 2.
    TransferPair middlePair = vector.getTransferPair(allocator);
    middlePair.splitAndTransfer(2, 6);
    try (IntVector middle = (IntVector) middlePair.getTo()) {
        System.out.print(middle);
    }
}
[0, 1, 2, 3, 4]
[2, 3, 4, 5, 6, 7]