Skip to content

Commit 918609f

Browse files
committed
PARQUET-286: Update String support to match upstream Avro.
This adds getStringableClass, which determines what String representation upstream Avro would use. Specific and reflect will use an alternative String class if java-class is set; that class is instantiated using a constructor that takes a String. Otherwise, reflect will always use String, and both specific and generic will use Utf8 or String depending on whether avro.java.string is set to "string".

The new string representations required two new converters: one for Utf8 and one for stringable classes (those with constructors that take a single String). The converters have also been refactored so that all binary converters now implement dictionary support.

Author: Ryan Blue <blue@apache.org>

Closes #201 from rdblue/PARQUET-286-avro-utf8-support and squashes the following commits:

beb5a44 [Ryan Blue] PARQUET-286: Add tests, support for stringable map keys.
0e9240f [Ryan Blue] PARQUET-286: Update string support to match upstream Avro.
1 parent d6f082b commit 918609f

9 files changed

Lines changed: 645 additions & 80 deletions

File tree

parquet-avro/pom.xml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,14 @@
8484
</dependencies>
8585

8686
<build>
87+
<resources>
88+
<resource>
89+
<directory>src/test/avro</directory>
90+
</resource>
91+
<resource>
92+
<directory>src/main/resources</directory>
93+
</resource>
94+
</resources>
8795
<plugins>
8896
<plugin>
8997
<artifactId>maven-enforcer-plugin</artifactId>
@@ -97,7 +105,15 @@
97105
<artifactId>avro-maven-plugin</artifactId>
98106
<version>${avro.version}</version>
99107
<executions>
108+
<execution>
109+
<id>compile-avsc</id>
110+
<phase>generate-test-sources</phase>
111+
<goals>
112+
<goal>schema</goal>
113+
</goals>
114+
</execution>
100115
<execution>
116+
<id>compile-idl</id>
101117
<phase>generate-test-sources</phase>
102118
<goals>
103119
<goal>idl-protocol</goal>

parquet-avro/src/main/java/org/apache/parquet/avro/AvroConverters.java

Lines changed: 89 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,14 @@
1818
*/
1919
package org.apache.parquet.avro;
2020

21+
import java.lang.reflect.Constructor;
22+
import java.lang.reflect.InvocationTargetException;
2123
import java.nio.ByteBuffer;
2224
import org.apache.avro.Schema;
2325
import org.apache.avro.generic.GenericData;
26+
import org.apache.avro.util.Utf8;
2427
import org.apache.parquet.column.Dictionary;
28+
import org.apache.parquet.io.ParquetDecodingException;
2529
import org.apache.parquet.io.api.Binary;
2630
import org.apache.parquet.io.api.GroupConverter;
2731
import org.apache.parquet.io.api.PrimitiveConverter;
@@ -44,6 +48,40 @@ public AvroPrimitiveConverter(ParentValueContainer parent) {
4448
}
4549
}
4650

51+
abstract static class BinaryConverter<T> extends AvroPrimitiveConverter {
52+
private T[] dict = null;
53+
54+
public BinaryConverter(ParentValueContainer parent) {
55+
super(parent);
56+
}
57+
58+
public abstract T convert(Binary binary);
59+
60+
@Override
61+
public void addBinary(Binary value) {
62+
parent.add(convert(value));
63+
}
64+
65+
@Override
66+
public boolean hasDictionarySupport() {
67+
return true;
68+
}
69+
70+
@Override
71+
@SuppressWarnings("unchecked")
72+
public void setDictionary(Dictionary dictionary) {
73+
dict = (T[]) new Object[dictionary.getMaxId() + 1];
74+
for (int i = 0; i <= dictionary.getMaxId(); i++) {
75+
dict[i] = convert(dictionary.decodeToBinary(i));
76+
}
77+
}
78+
79+
@Override
80+
public void addValueFromDictionary(int dictionaryId) {
81+
parent.add(dict[dictionaryId]);
82+
}
83+
}
84+
4785
static final class FieldByteConverter extends AvroPrimitiveConverter {
4886
public FieldByteConverter(ParentValueContainer parent) {
4987
super(parent);
@@ -54,6 +92,7 @@ public void addInt(int value) {
5492
parent.addByte((byte) value);
5593
}
5694
}
95+
5796
static final class FieldShortConverter extends AvroPrimitiveConverter {
5897
public FieldShortConverter(ParentValueContainer parent) {
5998
super(parent);
@@ -133,7 +172,6 @@ final public void addLong(long value) {
133172
final public void addFloat(float value) {
134173
parent.addFloat(value);
135174
}
136-
137175
}
138176

139177
static final class FieldDoubleConverter extends AvroPrimitiveConverter {
@@ -162,62 +200,84 @@ final public void addDouble(double value) {
162200
}
163201
}
164202

165-
static final class FieldByteArrayConverter extends AvroPrimitiveConverter {
203+
static final class FieldByteArrayConverter extends BinaryConverter<byte[]> {
166204
public FieldByteArrayConverter(ParentValueContainer parent) {
167205
super(parent);
168206
}
169207

170208
@Override
171-
final public void addBinary(Binary value) {
172-
parent.add(value.getBytes());
209+
public byte[] convert(Binary binary) {
210+
return binary.getBytes();
173211
}
174212
}
175213

176-
static final class FieldByteBufferConverter extends AvroPrimitiveConverter {
214+
static final class FieldByteBufferConverter extends BinaryConverter<ByteBuffer> {
177215
public FieldByteBufferConverter(ParentValueContainer parent) {
178216
super(parent);
179217
}
180218

181219
@Override
182-
final public void addBinary(Binary value) {
183-
parent.add(ByteBuffer.wrap(value.getBytes()));
220+
public ByteBuffer convert(Binary binary) {
221+
return ByteBuffer.wrap(binary.getBytes());
184222
}
185223
}
186224

187-
static final class FieldStringConverter extends AvroPrimitiveConverter {
188-
// TODO: dictionary support should be generic and provided by a parent
189-
// TODO: this always produces strings, but should respect avro.java.string
190-
private String[] dict;
191-
225+
static final class FieldStringConverter extends BinaryConverter<String> {
192226
public FieldStringConverter(ParentValueContainer parent) {
193227
super(parent);
194228
}
195229

196230
@Override
197-
final public void addBinary(Binary value) {
198-
parent.add(value.toStringUsingUTF8());
231+
public String convert(Binary binary) {
232+
return binary.toStringUsingUTF8();
199233
}
234+
}
200235

201-
@Override
202-
public boolean hasDictionarySupport() {
203-
return true;
236+
static final class FieldUTF8Converter extends BinaryConverter<Utf8> {
237+
public FieldUTF8Converter(ParentValueContainer parent) {
238+
super(parent);
204239
}
205240

206241
@Override
207-
public void setDictionary(Dictionary dictionary) {
208-
dict = new String[dictionary.getMaxId() + 1];
209-
for (int i = 0; i <= dictionary.getMaxId(); i++) {
210-
dict[i] = dictionary.decodeToBinary(i).toStringUsingUTF8();
242+
public Utf8 convert(Binary binary) {
243+
return new Utf8(binary.getBytes());
244+
}
245+
}
246+
247+
static final class FieldStringableConverter extends BinaryConverter<Object> {
248+
private final String stringableName;
249+
private final Constructor<?> ctor;
250+
251+
public FieldStringableConverter(ParentValueContainer parent,
252+
Class<?> stringableClass) {
253+
super(parent);
254+
stringableName = stringableClass.getName();
255+
try {
256+
this.ctor = stringableClass.getConstructor(String.class);
257+
} catch (NoSuchMethodException e) {
258+
throw new ParquetDecodingException(
259+
"Unable to get String constructor for " + stringableName, e);
211260
}
212261
}
213262

214263
@Override
215-
public void addValueFromDictionary(int dictionaryId) {
216-
parent.add(dict[dictionaryId]);
264+
public Object convert(Binary binary) {
265+
try {
266+
return ctor.newInstance(binary.toStringUsingUTF8());
267+
} catch (InstantiationException e) {
268+
throw new ParquetDecodingException(
269+
"Cannot convert binary to " + stringableName, e);
270+
} catch (IllegalAccessException e) {
271+
throw new ParquetDecodingException(
272+
"Cannot convert binary to " + stringableName, e);
273+
} catch (InvocationTargetException e) {
274+
throw new ParquetDecodingException(
275+
"Cannot convert binary to " + stringableName, e);
276+
}
217277
}
218278
}
219279

220-
static final class FieldEnumConverter extends AvroPrimitiveConverter {
280+
static final class FieldEnumConverter extends BinaryConverter<Object> {
221281
private final Schema schema;
222282
private final GenericData model;
223283

@@ -229,12 +289,12 @@ public FieldEnumConverter(ParentValueContainer parent, Schema enumSchema,
229289
}
230290

231291
@Override
232-
final public void addBinary(Binary value) {
233-
parent.add(model.createEnum(value.toStringUsingUTF8(), schema));
292+
public Object convert(Binary binary) {
293+
return model.createEnum(binary.toStringUsingUTF8(), schema);
234294
}
235295
}
236296

237-
static final class FieldFixedConverter extends AvroPrimitiveConverter {
297+
static final class FieldFixedConverter extends BinaryConverter<Object> {
238298
private final Schema schema;
239299
private final GenericData model;
240300

@@ -246,8 +306,8 @@ public FieldFixedConverter(ParentValueContainer parent, Schema avroSchema,
246306
}
247307

248308
@Override
249-
final public void addBinary(Binary value) {
250-
parent.add(model.createFixed(null /* reuse */, value.getBytes(), schema));
309+
public Object convert(Binary binary) {
310+
return model.createFixed(null /* reuse */, binary.getBytes(), schema);
251311
}
252312
}
253313
}

0 commit comments

Comments
 (0)