Arrow Flight¶
本节包含一些使用 Arrow Flight 的示例。有关 Flight 的更多详细信息,请查看 Arrow Flight RPC。
使用 Arrow Flight 的简单键值存储服务¶
我们将实现一个服务,该服务使用 Flight 处理上传/请求,并使用内存中的数据存储实际数据,从而为数据提供键值存储。
Flight 客户端和服务器¶
import org.apache.arrow.flight.Action;
import org.apache.arrow.flight.AsyncPutListener;
import org.apache.arrow.flight.CallStatus;
import org.apache.arrow.flight.Criteria;
import org.apache.arrow.flight.FlightClient;
import org.apache.arrow.flight.FlightDescriptor;
import org.apache.arrow.flight.FlightEndpoint;
import org.apache.arrow.flight.FlightInfo;
import org.apache.arrow.flight.FlightServer;
import org.apache.arrow.flight.FlightStream;
import org.apache.arrow.flight.Location;
import org.apache.arrow.flight.NoOpFlightProducer;
import org.apache.arrow.flight.PutResult;
import org.apache.arrow.flight.Result;
import org.apache.arrow.flight.Ticket;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.util.AutoCloseables;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.VectorUnloader;
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
class Dataset implements AutoCloseable {
private final List<ArrowRecordBatch> batches;
private final Schema schema;
private final long rows;
public Dataset(List<ArrowRecordBatch> batches, Schema schema, long rows) {
this.batches = batches;
this.schema = schema;
this.rows = rows;
}
public List<ArrowRecordBatch> getBatches() {
return batches;
}
public Schema getSchema() {
return schema;
}
public long getRows() {
return rows;
}
@Override
public void close() throws Exception {
AutoCloseables.close(batches);
}
}
class CookbookProducer extends NoOpFlightProducer implements AutoCloseable {
private final BufferAllocator allocator;
private final Location location;
private final ConcurrentMap<FlightDescriptor, Dataset> datasets;
public CookbookProducer(BufferAllocator allocator, Location location) {
this.allocator = allocator;
this.location = location;
this.datasets = new ConcurrentHashMap<>();
}
@Override
public Runnable acceptPut(CallContext context, FlightStream flightStream, StreamListener<PutResult> ackStream) {
List<ArrowRecordBatch> batches = new ArrayList<>();
return () -> {
long rows = 0;
VectorUnloader unloader;
while (flightStream.next()) {
unloader = new VectorUnloader(flightStream.getRoot());
final ArrowRecordBatch arb = unloader.getRecordBatch();
batches.add(arb);
rows += flightStream.getRoot().getRowCount();
}
Dataset dataset = new Dataset(batches, flightStream.getSchema(), rows);
datasets.put(flightStream.getDescriptor(), dataset);
ackStream.onCompleted();
};
}
@Override
public void getStream(CallContext context, Ticket ticket, ServerStreamListener listener) {
FlightDescriptor flightDescriptor = FlightDescriptor.path(
new String(ticket.getBytes(), StandardCharsets.UTF_8));
Dataset dataset = this.datasets.get(flightDescriptor);
if (dataset == null) {
throw CallStatus.NOT_FOUND.withDescription("Unknown descriptor").toRuntimeException();
}
try (VectorSchemaRoot root = VectorSchemaRoot.create(
this.datasets.get(flightDescriptor).getSchema(), allocator)) {
VectorLoader loader = new VectorLoader(root);
listener.start(root);
for (ArrowRecordBatch arrowRecordBatch : this.datasets.get(flightDescriptor).getBatches()) {
loader.load(arrowRecordBatch);
listener.putNext();
}
listener.completed();
}
}
@Override
public void doAction(CallContext context, Action action, StreamListener<Result> listener) {
FlightDescriptor flightDescriptor = FlightDescriptor.path(
new String(action.getBody(), StandardCharsets.UTF_8));
switch (action.getType()) {
case "DELETE": {
Dataset removed = datasets.remove(flightDescriptor);
if (removed != null) {
try {
removed.close();
} catch (Exception e) {
listener.onError(CallStatus.INTERNAL
.withDescription(e.toString())
.toRuntimeException());
return;
}
Result result = new Result("Delete completed".getBytes(StandardCharsets.UTF_8));
listener.onNext(result);
} else {
Result result = new Result("Delete not completed. Reason: Key did not exist."
.getBytes(StandardCharsets.UTF_8));
listener.onNext(result);
}
listener.onCompleted();
}
}
}
@Override
public FlightInfo getFlightInfo(CallContext context, FlightDescriptor descriptor) {
FlightEndpoint flightEndpoint = new FlightEndpoint(
new Ticket(descriptor.getPath().get(0).getBytes(StandardCharsets.UTF_8)), location);
return new FlightInfo(
datasets.get(descriptor).getSchema(),
descriptor,
Collections.singletonList(flightEndpoint),
/*bytes=*/-1,
datasets.get(descriptor).getRows()
);
}
@Override
public void listFlights(CallContext context, Criteria criteria, StreamListener<FlightInfo> listener) {
datasets.forEach((k, v) -> { listener.onNext(getFlightInfo(null, k)); });
listener.onCompleted();
}
@Override
public void close() throws Exception {
AutoCloseables.close(datasets.values());
}
}
Location location = Location.forGrpcInsecure("0.0.0.0", 33333);
try (BufferAllocator allocator = new RootAllocator()){
// Server
try(final CookbookProducer producer = new CookbookProducer(allocator, location);
final FlightServer flightServer = FlightServer.builder(allocator, location, producer).build()) {
try {
flightServer.start();
System.out.println("S1: Server (Location): Listening on port " + flightServer.getPort());
} catch (IOException e) {
throw new RuntimeException(e);
}
// Client
try (FlightClient flightClient = FlightClient.builder(allocator, location).build()) {
System.out.println("C1: Client (Location): Connected to " + location.getUri());
// Populate data
Schema schema = new Schema(Arrays.asList(
new Field("name", FieldType.nullable(new ArrowType.Utf8()), null)));
try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schema, allocator);
VarCharVector varCharVector = (VarCharVector) vectorSchemaRoot.getVector("name")) {
varCharVector.allocateNew(3);
varCharVector.set(0, "Ronald".getBytes());
varCharVector.set(1, "David".getBytes());
varCharVector.set(2, "Francisco".getBytes());
vectorSchemaRoot.setRowCount(3);
FlightClient.ClientStreamListener listener = flightClient.startPut(
FlightDescriptor.path("profiles"),
vectorSchemaRoot, new AsyncPutListener());
listener.putNext();
varCharVector.set(0, "Manuel".getBytes());
varCharVector.set(1, "Felipe".getBytes());
varCharVector.set(2, "JJ".getBytes());
vectorSchemaRoot.setRowCount(3);
listener.putNext();
listener.completed();
listener.getResult();
System.out.println("C2: Client (Populate Data): Wrote 2 batches with 3 rows each");
}
// Get metadata information
FlightInfo flightInfo = flightClient.getInfo(FlightDescriptor.path("profiles"));
System.out.println("C3: Client (Get Metadata): " + flightInfo);
// Get data information
try(FlightStream flightStream = flightClient.getStream(flightInfo.getEndpoints().get(0).getTicket())) {
int batch = 0;
try (VectorSchemaRoot vectorSchemaRootReceived = flightStream.getRoot()) {
System.out.println("C4: Client (Get Stream):");
while (flightStream.next()) {
batch++;
System.out.println("Client Received batch #" + batch + ", Data:");
System.out.print(vectorSchemaRootReceived.contentToTSVString());
}
}
} catch (Exception e) {
e.printStackTrace();
}
// Get all metadata information
Iterable<FlightInfo> flightInfosBefore = flightClient.listFlights(Criteria.ALL);
System.out.print("C5: Client (List Flights Info): ");
flightInfosBefore.forEach(t -> System.out.println(t));
// Do delete action
Iterator<Result> deleteActionResult = flightClient.doAction(new Action("DELETE",
FlightDescriptor.path("profiles").getPath().get(0).getBytes(StandardCharsets.UTF_8)));
while (deleteActionResult.hasNext()) {
Result result = deleteActionResult.next();
System.out.println("C6: Client (Do Delete Action): " +
new String(result.getBody(), StandardCharsets.UTF_8));
}
// Get all metadata information (to validate detele action)
Iterable<FlightInfo> flightInfos = flightClient.listFlights(Criteria.ALL);
flightInfos.forEach(t -> System.out.println(t));
System.out.println("C7: Client (List Flights Info): After delete - No records");
// Server shut down
flightServer.shutdown();
System.out.println("C8: Server shut down successfully");
}
} catch (Exception e) {
e.printStackTrace();
}
}
S1: Server (Location): Listening on port 33333
C1: Client (Location): Connected to grpc+tcp://0.0.0.0:33333
C2: Client (Populate Data): Wrote 2 batches with 3 rows each
C3: Client (Get Metadata): FlightInfo{schema=Schema<name: Utf8>, descriptor=profiles, endpoints=[FlightEndpoint{locations=[Location{uri=grpc+tcp://0.0.0.0:33333}], ticket=org.apache.arrow.flight.Ticket@58871b0a, expirationTime=(none)}], bytes=-1, records=6, ordered=false}
C4: Client (Get Stream):
Client Received batch #1, Data:
name
Ronald
David
Francisco
Client Received batch #2, Data:
name
Manuel
Felipe
JJ
C5: Client (List Flights Info): FlightInfo{schema=Schema<name: Utf8>, descriptor=profiles, endpoints=[FlightEndpoint{locations=[Location{uri=grpc+tcp://0.0.0.0:33333}], ticket=org.apache.arrow.flight.Ticket@58871b0a, expirationTime=(none)}], bytes=-1, records=6, ordered=false}
C6: Client (Do Delete Action): Delete completed
C7: Client (List Flights Info): After delete - No records
C8: Server shut down successfully
让我们更详细地解释我们的代码。
启动 Flight 服务器¶
首先,我们将启动我们的服务器
try(FlightServer flightServer = FlightServer.builder(allocator, location,
new CookbookProducer(allocator, location)).build()) {
try {
flightServer.start();
System.out.println("S1: Server (Location): Listening on port " + flightServer.getPort());
} catch (IOException e) {
e.printStackTrace();
}
S1: Server (Location): Listening on port 33333
连接到 Flight 服务器¶
然后,我们可以创建一个客户端并连接到服务器
try (FlightClient flightClient = FlightClient.builder(allocator, location).build()) {
System.out.println("C1: Client (Location): Connected to " + location.getUri());
C1: Client (Location): Connected to grpc+tcp://0.0.0.0:33333
放入数据¶
首先,我们将创建一个向量模式根并将其上传,该模式根将被服务器存储在内存中。
// Server
public Runnable acceptPut(CallContext context, FlightStream flightStream, StreamListener<PutResult> ackStream) {
List<ArrowRecordBatch> batches = new ArrayList<>();
return () -> {
long rows = 0;
VectorUnloader unloader;
while (flightStream.next()) {
unloader = new VectorUnloader(flightStream.getRoot());
try (final ArrowRecordBatch arb = unloader.getRecordBatch()) {
batches.add(arb);
rows += flightStream.getRoot().getRowCount();
}
}
Dataset dataset = new Dataset(batches, flightStream.getSchema(), rows);
datasets.put(flightStream.getDescriptor(), dataset);
ackStream.onCompleted();
};
}
// Client
Schema schema = new Schema(Arrays.asList(
new Field("name", FieldType.nullable(new ArrowType.Utf8()), null)));
try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schema, allocator);
VarCharVector varCharVector = (VarCharVector) vectorSchemaRoot.getVector("name")) {
varCharVector.allocateNew(3);
varCharVector.set(0, "Ronald".getBytes());
varCharVector.set(1, "David".getBytes());
varCharVector.set(2, "Francisco".getBytes());
vectorSchemaRoot.setRowCount(3);
FlightClient.ClientStreamListener listener = flightClient.startPut(
FlightDescriptor.path("profiles"),
vectorSchemaRoot, new AsyncPutListener());
listener.putNext();
varCharVector.set(0, "Manuel".getBytes());
varCharVector.set(1, "Felipe".getBytes());
varCharVector.set(2, "JJ".getBytes());
vectorSchemaRoot.setRowCount(3);
listener.putNext();
listener.completed();
listener.getResult();
System.out.println("C2: Client (Populate Data): Wrote 2 batches with 3 rows each");
}
C2: Client (Populate Data): Wrote 2 batches with 3 rows each
获取元数据¶
这样做之后,我们可以检索该数据集的元数据。
// Server
public FlightInfo getFlightInfo(CallContext context, FlightDescriptor descriptor) {
FlightEndpoint flightEndpoint = new FlightEndpoint(
new Ticket(descriptor.getPath().get(0).getBytes(StandardCharsets.UTF_8)), location);
return new FlightInfo(
datasets.get(descriptor).getSchema(),
descriptor,
Collections.singletonList(flightEndpoint),
/*bytes=*/-1,
datasets.get(descriptor).getRows()
);
}
// Client
FlightInfo flightInfo = flightClient.getInfo(FlightDescriptor.path("profiles"));
System.out.println("C3: Client (Get Metadata): " + flightInfo);
C3: Client (Get Metadata): FlightInfo{schema=Schema<name: Utf8>, descriptor=profiles, endpoints=[FlightEndpoint{locations=[Location{uri=grpc+tcp://0.0.0.0:33333}], ticket=org.apache.arrow.flight.Ticket@58871b0a, expirationTime=(none)}], bytes=-1, records=6}
获取数据¶
并获取数据。
// Server
public void getStream(CallContext context, Ticket ticket, ServerStreamListener listener) {
FlightDescriptor flightDescriptor = FlightDescriptor.path(
new String(ticket.getBytes(), StandardCharsets.UTF_8));
Dataset dataset = this.datasets.get(flightDescriptor);
if (dataset == null) {
throw CallStatus.NOT_FOUND.withDescription("Unknown descriptor").toRuntimeException();
} else {
VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(
this.datasets.get(flightDescriptor).getSchema(), allocator);
listener.start(vectorSchemaRoot);
for (ArrowRecordBatch arrowRecordBatch : this.datasets.get(flightDescriptor).getBatches()) {
VectorLoader loader = new VectorLoader(vectorSchemaRoot);
loader.load(arrowRecordBatch.cloneWithTransfer(allocator));
listener.putNext();
}
listener.completed();
}
}
// Client
try(FlightStream flightStream = flightClient.getStream(flightInfo.getEndpoints().get(0).getTicket())) {
int batch = 0;
try (VectorSchemaRoot vectorSchemaRootReceived = flightStream.getRoot()) {
System.out.println("C4: Client (Get Stream):");
while (flightStream.next()) {
batch++;
System.out.println("Client Received batch #" + batch + ", Data:");
System.out.print(vectorSchemaRootReceived.contentToTSVString());
}
}
} catch (Exception e) {
e.printStackTrace();
}
C4: Client (Get Stream):
Client Received batch #1, Data:
name
Ronald
David
Francisco
Client Received batch #2, Data:
name
Manuel
Felipe
JJ
删除数据¶
然后,我们将删除数据集。
// Server
public void doAction(CallContext context, Action action, StreamListener<Result> listener) {
FlightDescriptor flightDescriptor = FlightDescriptor.path(
new String(action.getBody(), StandardCharsets.UTF_8));
switch (action.getType()) {
case "DELETE":
if (datasets.remove(flightDescriptor) != null) {
Result result = new Result("Delete completed".getBytes(StandardCharsets.UTF_8));
listener.onNext(result);
} else {
Result result = new Result("Delete not completed. Reason: Key did not exist."
.getBytes(StandardCharsets.UTF_8));
listener.onNext(result);
}
listener.onCompleted();
}
}
// Client
Iterator<Result> deleteActionResult = flightClient.doAction(new Action("DELETE",
FlightDescriptor.path("profiles").getPath().get(0).getBytes(StandardCharsets.UTF_8)));
while (deleteActionResult.hasNext()) {
Result result = deleteActionResult.next();
System.out.println("C6: Client (Do Delete Action): " +
new String(result.getBody(), StandardCharsets.UTF_8));
}
C6: Client (Do Delete Action): Delete completed
验证删除数据¶
并确认它已被删除。
// Server
public void listFlights(CallContext context, Criteria criteria, StreamListener<FlightInfo> listener) {
datasets.forEach((k, v) -> { listener.onNext(getFlightInfo(null, k)); });
listener.onCompleted();
}
// Client
Iterable<FlightInfo> flightInfos = flightClient.listFlights(Criteria.ALL);
flightInfos.forEach(t -> System.out.println(t));
System.out.println("C7: Client (List Flights Info): After delete - No records");
C7: Client (List Flights Info): After delete - No records
停止 Flight 服务器¶
// Server
flightServer.shutdown();
System.out.println("C8: Server shut down successfully");
C8: Server shut down successfully
Arrow Flight RPC: https://arrow.apache.org/docs/format/Flight.html