From aed09b152a2fce9b6599ee938e6aa79029fe96b5 Mon Sep 17 00:00:00 2001
From: Nicolas Raynaud
Date: Mon, 28 Sep 2020 15:42:55 +0200
Subject: [PATCH] fix(import/ova): speed up the import of gzipped vmdk disks nested in .ova (#5275)

This is the follow-up to #5085.

Avoid unzipping the entire file from the beginning before each read. The test case went from 10 minutes down to 26 seconds.

When reading a block from the gzipped file, we keep the current decompression state in memory. If the next read happens at an offset greater than the previous read, we just carry on decompressing the file until the desired position is reached. The previous code would decompress from the start of the file for every read operation.
---
 CHANGELOG.unreleased.md            |  1 +
 packages/xo-vmdk-to-vhd/src/ova.js | 84 +++++++++++++++++-------------
 2 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/CHANGELOG.unreleased.md b/CHANGELOG.unreleased.md
index 87a3a040c..09648a665 100644
--- a/CHANGELOG.unreleased.md
+++ b/CHANGELOG.unreleased.md
@@ -22,6 +22,7 @@
 - [API] Fix `this.removeSubjectFromResourceSet is not a function` error on calling `resourceSet.removeSubject` via `xo-cli` [#5265](https://github.com/vatesfr/xen-orchestra/issues/5265) (PR [#5266](https://github.com/vatesfr/xen-orchestra/pull/5266))
 - [Import OVA] Fix frozen UI when dropping a big OVA on the page (PR [#5274](https://github.com/vatesfr/xen-orchestra/pull/5274))
 - [Remotes/S3] Fix S3 backup of 50GB+ files [#5197](https://github.com/vatesfr/xen-orchestra/issues/5197) (PR[ #5242](https://github.com/vatesfr/xen-orchestra/pull/5242) )
+- [Import OVA] Improve import speed of embedded gzipped VMDK disks (PR [#5275](https://github.com/vatesfr/xen-orchestra/pull/5275))
 - [Remotes] Fix editing bucket and directory for S3 remotes [#5233](https://github.com/vatesfr/xen-orchestra/issues/5233) (PR [5276](https://github.com/vatesfr/xen-orchestra/pull/5276))

 ### Packages to release

diff --git a/packages/xo-vmdk-to-vhd/src/ova.js b/packages/xo-vmdk-to-vhd/src/ova.js
index 4bc2532b9..28963cf42 100644
--- a/packages/xo-vmdk-to-vhd/src/ova.js
+++ b/packages/xo-vmdk-to-vhd/src/ova.js
@@ -1,3 +1,4 @@
+import assert from 'assert'
 import find from 'lodash/find'
 import forEach from 'lodash/forEach'
 import pako from 'pako'
@@ -95,7 +96,8 @@ function parseTarHeader(header, stringDeserializer) {
   const sizeBuffer = header.slice(124, 124 + 12)
   // size encoding: https://codeistry.wordpress.com/2014/08/14/how-to-parse-a-tar-file/
   let fileSize = 0
-  // If the leading byte is 0x80 (128), the non-leading bytes of the field are concatenated in big-endian order, with the result being a positive number expressed in binary form.
+  // If the leading byte is 0x80 (128), the non-leading bytes of the field are concatenated in big-endian order,
+  // with the result being a positive number expressed in binary form.
   //
   // Source: https://www.gnu.org/software/tar/manual/html_node/Extensions.html
   if (new Uint8Array(sizeBuffer)[0] === 128) {
@@ -218,41 +220,6 @@ async function parseOVF(fileFragment, stringDeserializer) {

 const GZIP_CHUNK_SIZE = 4 * 1024 * 1024

-async function parseGzipFromStart(start, end, fileSlice) {
-  let currentDeflatedPos = 0
-  let currentInflatedPos = 0
-  const inflate = new pako.Inflate()
-  const chunks = []
-  while (currentInflatedPos < end) {
-    const slice = fileSlice.slice(
-      currentDeflatedPos,
-      currentDeflatedPos + GZIP_CHUNK_SIZE
-    )
-    const compressed = await slice.read()
-    inflate.push(compressed, pako.Z_SYNC_FLUSH)
-    let chunk = inflate.result
-    const inflatedChunkEnd = currentInflatedPos + chunk.length
-    if (inflatedChunkEnd > start) {
-      if (currentInflatedPos < start) {
-        chunk = chunk.slice(start - currentInflatedPos)
-      }
-      if (inflatedChunkEnd > end) {
-        chunk = chunk.slice(0, -(inflatedChunkEnd - end))
-      }
-      chunks.push(chunk)
-    }
-    currentInflatedPos = inflatedChunkEnd
-    currentDeflatedPos += GZIP_CHUNK_SIZE
-  }
-  const resultBuffer = new Uint8Array(sum(chunks.map(c => c.length)))
-  let index = 0
-  chunks.forEach(c => {
-    resultBuffer.set(c, index)
-    index += c.length
-  })
-  return resultBuffer.buffer
-}
-
 // start and end are negative numbers
 // used with streamOptimized format where only the footer has the directory address filled
 async function parseGzipFromEnd(start, end, fileSlice, header) {
@@ -335,7 +302,52 @@ export async function parseOVAFile(
     }
     if (!skipVmdk && header.fileName.toLowerCase().endsWith('.vmdk.gz')) {
       const fileSlice = parsableFile.slice(offset, offset + header.fileSize)
+      let forwardsInflater = new pako.Inflate()
       const readFile = async (start, end) => {
+        // if next read is further down the stream than previous read, re-uses the previous zstream
+        async function parseGzipFromStart(start, end, fileSlice) {
+          const chunks = []
+          const resultStart = () =>
+            forwardsInflater.strm.total_out - forwardsInflater.result.length
+          if (forwardsInflater.result != null && start < resultStart()) {
+            // the block we are reading starts before the last decompressed chunk, reset stream
+            forwardsInflater = new pako.Inflate()
+          }
+          let isLast = false
+          while (true) {
+            if (forwardsInflater.strm.total_out > start) {
+              let chunk = forwardsInflater.result
+              if (resultStart() < start) {
+                chunk = chunk.slice(start - resultStart())
+              }
+              if (forwardsInflater.strm.total_out > end) {
+                chunk = chunk.slice(0, -(forwardsInflater.strm.total_out - end))
+                isLast = true
+              }
+              chunks.push(chunk)
+            }
+            if (isLast) {
+              // don't move the stream forwards if we took our last chunk
+              // gives the next read operation an opportunity to read from the same position
+              break
+            }
+            const slice = fileSlice.slice(
+              forwardsInflater.strm.total_in,
+              forwardsInflater.strm.total_in + GZIP_CHUNK_SIZE
+            )
+            forwardsInflater.push(await slice.read(), pako.Z_SYNC_FLUSH)
+          }
+          const resultBuffer = new Uint8Array(sum(chunks.map(c => c.length)))
+          let index = 0
+          chunks.forEach(c => {
+            resultBuffer.set(c, index)
+            index += c.length
+          })
+          assert.strictEqual(resultBuffer.buffer.byteLength, end - start)
+          return resultBuffer.buffer
+        }
+
         if (start === end) {
           return new Uint8Array(0)
         }
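
For reference, a minimal sketch of the pako.Inflate behaviour the new readFile path relies on. This is illustrative only, not part of the patch; it assumes pako's streaming API with Z_SYNC_FLUSH and strm.total_in / strm.total_out as used in ova.js, and feed() is a hypothetical helper:

    import pako from 'pako'

    // One Inflate instance is kept alive across reads, so each push() only
    // decompresses the bytes between the previous position and the new one
    // instead of restarting the gzip stream from byte 0.
    const inflater = new pako.Inflate()

    // Hypothetical helper: feed the next compressed chunk and report where the stream now stands.
    function feed(compressedChunk) {
      // With Z_SYNC_FLUSH, pako exposes the bytes decoded by this push in `result`
      inflater.push(compressedChunk, pako.Z_SYNC_FLUSH)
      return {
        decoded: inflater.result, // Uint8Array produced since the previous flush
        compressedOffset: inflater.strm.total_in, // gzip bytes consumed so far
        decompressedOffset: inflater.strm.total_out, // plain bytes produced so far
      }
    }

Because total_out only ever moves forward, a read that starts before the data already inflated cannot be served by the same stream, which is why the patch recreates the Inflate instance in that case.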