背景
本文基于Spark 4.0
总结
Spark 中的 VariantType 类型,用尽量少的字节来存储 JSON 格式的数据。
分析
这里主要介绍 Variant 的存储,我们从 VariantBuilder.parseJson 方法(把对应的 JSON 数据存储为 VariantType 类型)开始:
/**
 * Parses the JSON content supplied by {@code parser} into a {@link Variant}.
 *
 * <p>A fresh {@code VariantBuilder} is created for every call, handed the parser through
 * {@code buildJson}, and then asked for its {@code result()}.
 *
 * @param parser the JSON parser to read from
 * @param allowDuplicateKeys forwarded unchanged to the {@code VariantBuilder} constructor
 * @return whatever {@code VariantBuilder.result()} yields after the parser has been consumed
 * @throws IOException declared by this method; presumably propagated from {@code buildJson}
 */
public static Variant parseJson(JsonParser parser, boolean allowDuplicateKeys)
    throws IOException {
  final VariantBuilder variantBuilder = new VariantBuilder(allowDuplicateKeys);
  variantBuilder.buildJson(parser);
  return variantBuilder.result();
}
该方法会调用 buildJson 方法:
JsonToken token = parser.currentToken();
if (token == null) {
throw new JsonParseException(parser, "Unexpected null token");
}
switch (token) {
case START_OBJECT: {
ArrayList<FieldEntry> fields = new ArrayList<>();
int start = writePos;
while (parser.nextToken() != JsonToken.END_OBJECT) {
String key = parser.currentName();
parser.nextToken();
int id = addKey(key);
fields.add(new FieldEntry(key, id, writePos - start));
buildJson(parser);
}
finishWritingObject(start, fields);
break;
}
case START_ARRAY: {
ArrayList<Integer> offsets = new ArrayList<>();
int start = writePos;
while (parser.nextToken() != JsonToken.END_ARRAY) {
offsets.add(writePos - start);
buildJson(parser);
}
finishWritingArray(start, offsets);
break;
}
case VALUE_STRING:
appendString(parser.getText());
break;
case VALUE_NUMBER_INT:
try {
appendLong(parser.getLongValue(