ChunkReader

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.maven.index.reader;

import java.io.Closeable;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UTFDataFormatException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.zip.GZIPInputStream;

/**
 * Maven Index published binary chunk reader, it reads raw Maven Indexer records from the transport binary format.
 * Instances of this class MUST BE handled as resources (have them closed once done with them), it is user
 * responsibility to close them, ideally in try-with-resource block.
 * <p>
 * The reader is a single-pass view over one stream: every iterator obtained from {@link #iterator()} consumes
 * records from the same underlying input, and each one reads a record ahead as soon as it is created.
 *
 * @since 5.1.2
 */
public class ChunkReader implements Closeable, Iterable<Map<String, String>> {
    private final String chunkName;

    private final DataInputStream dataInputStream;

    private final int version;

    private final Date timestamp;

    /**
     * Opens the chunk and reads its header: one unsigned byte holding the version, followed by a {@code long}
     * holding the timestamp in epoch milliseconds.
     *
     * @param chunkName the chunk name, surrounding whitespace is trimmed; must not be {@code null}
     * @param inputStream the GZIP compressed chunk content
     * @throws IOException if the content is not GZIP data or the header cannot be read; when the header cannot
     *         be read the stream (including {@code inputStream}) is closed before the exception propagates
     */
    public ChunkReader(final String chunkName, final InputStream inputStream) throws IOException {
        this.chunkName = chunkName.trim();
        final DataInputStream in = new DataInputStream(new GZIPInputStream(inputStream, 2 * 1024));
        final int headerVersion;
        final long headerTimestamp;
        try {
            headerVersion = in.readUnsignedByte();
            headerTimestamp = in.readLong();
        } catch (IOException | RuntimeException e) {
            // The caller never receives an instance it could close, so release the decompressor here.
            try {
                in.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.dataInputStream = in;
        this.version = headerVersion;
        this.timestamp = new Date(headerTimestamp);
    }

    /**
     * Returns the chunk name.
     */
    public String getName() {
        return chunkName;
    }

    /**
     * Returns index version. All releases so far always returned {@code 1}.
     */
    public int getVersion() {
        return version;
    }

    /**
     * Returns the index timestamp of last update of the index.
     *
     * @return a fresh {@link Date} on every call, so modifying it does not affect this reader
     */
    public Date getTimestamp() {
        return new Date(timestamp.getTime());
    }

    /**
     * Returns an iterator over the raw records of this chunk, each record being a map of field name to field value.
     * <p>
     * The first record is read while the iterator is being created. All iterators returned by this method read
     * from the same underlying stream, so records consumed by one iterator are not seen by another.
     *
     * @throws RuntimeException wrapping the {@link IOException} if the first record cannot be read
     */
    @Override
    public Iterator<Map<String, String>> iterator() {
        return new IndexIterator(dataInputStream);
    }

    /**
     * Closes this reader and its underlying input.
     */
    @Override
    public void close() throws IOException {
        dataInputStream.close();
    }

    /**
     * Low memory footprint index iterator that incrementally parses the underlying stream. It always holds one
     * record read ahead; read failures surface as {@link RuntimeException} with the {@link IOException} as cause.
     */
    private static class IndexIterator implements Iterator<Map<String, String>> {
        private final DataInputStream dataInputStream;

        /** The record to be handed out by the next call of {@link #next()}, {@code null} once depleted. */
        private Map<String, String> nextRecord;

        private IndexIterator(final DataInputStream dataInputStream) {
            this.dataInputStream = dataInputStream;
            this.nextRecord = nextRecord();
        }

        @Override
        public boolean hasNext() {
            return nextRecord != null;
        }

        @Override
        public Map<String, String> next() {
            if (nextRecord == null) {
                throw new NoSuchElementException("chunk depleted");
            }
            Map<String, String> result = nextRecord;
            nextRecord = nextRecord();
            return result;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("remove");
        }

        /**
         * Reads the next record, converting the checked {@link IOException} into an unchecked exception as
         * {@link Iterator} methods cannot throw checked ones.
         */
        private Map<String, String> nextRecord() {
            try {
                return readRecord(dataInputStream);
            } catch (IOException e) {
                throw new RuntimeException("read error", e);
            }
        }
    }

    /**
     * Reads and returns next record from the underlying stream, or {@code null} if no more records.
     * <p>
     * A record is an {@code int} field count followed by that many fields. An {@link EOFException} raised while
     * reading the field count is taken as the regular end of the chunk.
     *
     * @throws IOException if reading fails or the field count is negative
     */
    private static Map<String, String> readRecord(final DataInput dataInput) throws IOException {
        int fieldCount;
        try {
            fieldCount = dataInput.readInt();
        } catch (EOFException ex) {
            return null; // no more documents
        }
        if (fieldCount < 0) {
            // Without this check a corrupt count would silently produce an empty record.
            throw new IOException("Index data content is corrupt: negative field count " + fieldCount);
        }

        Map<String, String> recordMap = new HashMap<>();
        for (int i = 0; i < fieldCount; i++) {
            readField(recordMap, dataInput);
        }
        return recordMap;
    }

    /**
     * Reads one field (flags byte, name, value) and stores it into the given record. A later field with the same
     * name replaces the earlier one.
     */
    private static void readField(final Map<String, String> record, final DataInput dataInput) throws IOException {
        dataInput.readByte(); // flags: neglect them
        String name = dataInput.readUTF();
        String value = readUTF(dataInput);
        record.put(name, value);
    }

    /**
     * Reads a string stored as an {@code int} byte length followed by that many encoded bytes. Unlike
     * {@link DataInput#readUTF()}, which is used for field names, the length prefix here is four bytes wide.
     * One, two and three byte sequences are decoded, any other lead byte is rejected.
     *
     * @throws UTFDataFormatException if the bytes are not a valid encoding
     * @throws IOException if reading fails, or the length is negative or too large to allocate
     */
    private static String readUTF(final DataInput dataInput) throws IOException {
        int utflen = dataInput.readInt();
        if (utflen < 0) {
            // Report corrupt data as IOException instead of leaking NegativeArraySizeException.
            throw new IOException("Index data content is corrupt: negative string length " + utflen);
        }

        byte[] bytearr;
        char[] chararr;

        try {
            bytearr = new byte[utflen];
            chararr = new char[utflen];
        } catch (OutOfMemoryError e) {
            throw new IOException("Index data content is corrupt", e);
        }

        int c, char2, char3;
        int count = 0;
        int chararrCount = 0;

        dataInput.readFully(bytearr, 0, utflen);

        // Fast path: copy the leading run of single byte (ASCII) characters.
        while (count < utflen) {
            c = bytearr[count] & 0xff;
            if (c > 127) {
                break;
            }
            count++;
            chararr[chararrCount++] = (char) c;
        }

        // General path: dispatch on the high nibble of the lead byte.
        while (count < utflen) {
            c = bytearr[count] & 0xff;
            switch (c >> 4) {
                case 0:
                case 1:
                case 2:
                case 3:
                case 4:
                case 5:
                case 6:
                case 7:
                    /* 0xxxxxxx */
                    count++;
                    chararr[chararrCount++] = (char) c;
                    break;

                case 12:
                case 13:
                    /* 110x xxxx 10xx xxxx */
                    count += 2;
                    if (count > utflen) {
                        throw new UTFDataFormatException("malformed input: partial character at end");
                    }
                    char2 = bytearr[count - 1];
                    if ((char2 & 0xC0) != 0x80) {
                        throw new UTFDataFormatException("malformed input around byte " + count);
                    }
                    chararr[chararrCount++] = (char) (((c & 0x1F) << 6) | (char2 & 0x3F));
                    break;

                case 14:
                    /* 1110 xxxx 10xx xxxx 10xx xxxx */
                    count += 3;
                    if (count > utflen) {
                        throw new UTFDataFormatException("malformed input: partial character at end");
                    }
                    char2 = bytearr[count - 2];
                    char3 = bytearr[count - 1];
                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
                        throw new UTFDataFormatException("malformed input around byte " + (count - 1));
                    }
                    chararr[chararrCount++] = (char) (((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F));
                    break;

                default:
                    /* 10xx xxxx, 1111 xxxx */
                    throw new UTFDataFormatException("malformed input around byte " + count);
            }
        }

        // The number of chars produced may be less than utflen
        return new String(chararr, 0, chararrCount);
    }
}