数据操作¶
与比较、过滤或转换数据相关的食谱。
连接 VectorSchemaRoots¶
在某些情况下,需要将 VectorSchemaRoot 作为容器进行建模。为此,可以使用 VectorSchemaRootAppender.append。
以下代码创建两个根,然后将它们连接在一起:
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.VectorSchemaRootAppender;
import static java.util.Arrays.asList;
Field column_one = new Field("column-one", FieldType.nullable(new ArrowType.Int(32, true)), null);
Schema schema = new Schema(asList(column_one));
try (
BufferAllocator allocator = new RootAllocator();
VectorSchemaRoot rootOne = VectorSchemaRoot.create(schema, allocator);
VectorSchemaRoot rootTwo = VectorSchemaRoot.create(schema, allocator);
VectorSchemaRoot result = VectorSchemaRoot.create(schema, allocator);
) {
IntVector appenderOne = (IntVector) rootOne.getVector(0);
rootOne.allocateNew();
appenderOne.set(0, 100);
appenderOne.set(1, 20);
rootOne.setRowCount(2);
IntVector appenderTwo = (IntVector) rootTwo.getVector(0);
rootTwo.allocateNew();
appenderTwo.set(0, 34);
appenderTwo.set(1, 75);
rootTwo.setRowCount(2);
result.allocateNew();
VectorSchemaRootAppender.append(result, rootOne, rootTwo);
System.out.print(result.contentToTSVString());
}
column-one
100
20
34
75
连接值向量¶
在某些情况下,我们需要将两个值向量连接成一个。为此,我们可以使用 VectorAppender。这会修改初始 ValueVector。
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.util.VectorAppender;
try (
BufferAllocator allocator = new RootAllocator();
IntVector initialValues = new IntVector("initialValues", allocator);
IntVector toAppend = new IntVector("toAppend", allocator);
) {
initialValues.allocateNew(2);
initialValues.set(0, 1);
initialValues.set(1, 2);
initialValues.setValueCount(2);
System.out.println("Initial IntVector: " + initialValues);
toAppend.allocateNew(4);
toAppend.set(1, 4);
toAppend.set(3, 6);
toAppend.setValueCount(4);
System.out.println("IntVector to Append: " + toAppend);
VectorAppender appenderUtil = new VectorAppender(initialValues);
toAppend.accept(appenderUtil, null);
System.out.println("IntVector Result: " + initialValues);
}
Initial IntVector: [1, 2]
IntVector to Append: [null, 4, null, 6]
IntVector Result: [1, 2, null, 4, null, 6]
比较向量以检查字段是否相等¶
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.compare.TypeEqualsVisitor;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector right = new IntVector("int", allocator);
) {
right.allocateNew(3);
right.set(0, 10);
right.set(1, 20);
right.set(2, 30);
right.setValueCount(3);
IntVector left1 = new IntVector("int", allocator);
IntVector left2 = new IntVector("int2", allocator);
TypeEqualsVisitor visitor = new TypeEqualsVisitor(right);
System.out.println(visitor.equals(left1));
System.out.println(visitor.equals(left2));
}
true
false
比较向量是否相等¶
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.compare.VectorEqualsVisitor;
// Compares whole vectors for equality.
// All three vectors intentionally carry the same name ("vector1") so that only
// their contents differ: vector1 and vector2 hold 10, vector3 holds 20.
try (
    BufferAllocator allocator = new RootAllocator();
    IntVector vector1 = new IntVector("vector1", allocator);
    IntVector vector2 = new IntVector("vector1", allocator);
    IntVector vector3 = new IntVector("vector1", allocator)
) {
    vector1.allocateNew(1);
    vector1.set(0, 10);
    vector1.setValueCount(1);

    vector2.allocateNew(1);
    vector2.set(0, 10);
    vector2.setValueCount(1);

    vector3.allocateNew(1);
    vector3.set(0, 20);
    vector3.setValueCount(1);

    // vectorEquals is a static utility, so it is invoked through the class
    // rather than through a throwaway VectorEqualsVisitor instance.
    System.out.println(VectorEqualsVisitor.vectorEquals(vector1, vector2));
    System.out.println(VectorEqualsVisitor.vectorEquals(vector1, vector3));
}
true
false
比较数组上的值¶
比较向量中给定索引处的两个值
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
VarCharVector vec = new VarCharVector("valueindexcomparator", allocator);
) {
vec.allocateNew(3);
vec.setValueCount(3);
vec.set(0, "ba".getBytes());
vec.set(1, "abc".getBytes());
vec.set(2, "aa".getBytes());
VectorValueComparator<VarCharVector> valueComparator = DefaultVectorComparators.createDefaultComparator(vec);
valueComparator.attachVector(vec);
System.out.println(valueComparator.compare(0, 1) > 0);
System.out.println(valueComparator.compare(1, 2) < 0);
}
true
false
请注意,如果我们需要自己的比较器,我们可以扩展 VectorValueComparator 并根据需要覆盖 compareNotNull 方法
在数组上搜索值¶
线性搜索 - O(n)¶
算法:org.apache.arrow.algorithm.search.VectorSearcher#linearSearch - O(n)
import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector linearSearchVector = new IntVector("linearSearchVector", allocator);
) {
linearSearchVector.allocateNew(10);
linearSearchVector.setValueCount(10);
for (int i = 0; i < 10; i++) {
linearSearchVector.set(i, i);
}
VectorValueComparator<IntVector> comparatorInt = DefaultVectorComparators.createDefaultComparator(linearSearchVector);
int result = VectorSearcher.linearSearch(linearSearchVector, comparatorInt, linearSearchVector, 3);
System.out.println(result);
}
3
二分搜索 - O(log(n))¶
算法:org.apache.arrow.algorithm.search.VectorSearcher#binarySearch - O(log(n))
import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector binarySearchVector = new IntVector("", allocator);
) {
binarySearchVector.allocateNew(10);
binarySearchVector.setValueCount(10);
for (int i = 0; i < 10; i++) {
binarySearchVector.set(i, i);
}
VectorValueComparator<IntVector> comparatorInt = DefaultVectorComparators.createDefaultComparator(binarySearchVector);
int result = VectorSearcher.binarySearch(binarySearchVector, comparatorInt, binarySearchVector, 3);
System.out.println(result);
}
3
对数组上的值进行排序¶
就地排序器 - O(nlog(n))¶
通过操作原始向量进行排序。算法:org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter - O(nlog(n))
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector intVectorNotSorted = new IntVector("intvectornotsorted", allocator);
) {
intVectorNotSorted.allocateNew(3);
intVectorNotSorted.setValueCount(3);
intVectorNotSorted.set(0, 10);
intVectorNotSorted.set(1, 8);
intVectorNotSorted.setNull(2);
FixedWidthInPlaceVectorSorter<IntVector> sorter = new FixedWidthInPlaceVectorSorter<IntVector>();
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
sorter.sortInPlace(intVectorNotSorted, comparator);
System.out.println(intVectorNotSorted);
}
[null, 8, 10]
非就地排序器 - O(nlog(n))¶
通过将向量元素按排序顺序复制到一个新向量中来进行排序 - O(nlog(n))。算法:org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter 和 org.apache.arrow.algorithm.sort.VariableWidthOutOfPlaceVectorSorter
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.OutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector intVectorNotSorted = new IntVector("intvectornotsorted", allocator);
IntVector intVectorSorted = (IntVector) intVectorNotSorted.getField()
.getFieldType().createNewSingleVector("new-out-of-place-sorter",
allocator, null);
) {
intVectorNotSorted.allocateNew(3);
intVectorNotSorted.setValueCount(3);
intVectorNotSorted.set(0, 10);
intVectorNotSorted.set(1, 8);
intVectorNotSorted.setNull(2);
OutOfPlaceVectorSorter<IntVector> sorterOutOfPlaceSorter = new FixedWidthOutOfPlaceVectorSorter<>();
VectorValueComparator<IntVector> comparatorOutOfPlaceSorter = DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
intVectorSorted.allocateNew(intVectorNotSorted.getValueCount());
intVectorSorted.setValueCount(intVectorNotSorted.getValueCount());
sorterOutOfPlaceSorter.sortOutOfPlace(intVectorNotSorted, intVectorSorted, comparatorOutOfPlaceSorter);
System.out.println(intVectorSorted);
}
[null, 8, 10]