Merge "Reduce libziparchive internal hashtable memory size"

commit: 50947c1b29780769aca5e9482c9a3119943a43cd [log] [tgz]
author: Treehugger Robot <treehugger-gerrit@google.com> Thu Sep 20 06:45:33 2018 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> Thu Sep 20 06:45:33 2018 +0000
tree: 5b5c769115e061d3cee4001975f1f18e38e0953f
parent: b649d39bd7bfd6084e43c96a8b88fc30ba3e8b2d [diff]
parent: 5a503efaf03227a99f54f50657c922c9e8a6af5e [diff]
diff --git a/libziparchive/zip_archive.cc b/libziparchive/zip_archive.cc
index add6e14..fb300a7 100644
--- a/libziparchive/zip_archive.cc
+++ b/libziparchive/zip_archive.cc

@@ -122,21 +122,36 @@
 #endif
 }
 
+static bool isZipStringEqual(const uint8_t* start, const ZipString& zip_string,
+                             const ZipStringOffset& zip_string_offset) {
+  const ZipString from_offset = zip_string_offset.GetZipString(start);
+  return from_offset == zip_string;
+}
+
+/**
+ * Returns offset of ZipString#name from the start of the central directory in the memory map.
+ * For valid ZipStrings contained in the zip archive mmap, 0 < offset < 0xffffff.
+ */
+static inline uint32_t GetOffset(const uint8_t* name, const uint8_t* start) {
+  CHECK_GT(name, start);
+  CHECK_LT(name, start + 0xffffff);
+  return static_cast<uint32_t>(name - start);
+}
+
 /*
  * Convert a ZipEntry to a hash table index, verifying that it's in a
  * valid range.
  */
-static int64_t EntryToIndex(const ZipString* hash_table, const uint32_t hash_table_size,
-                            const ZipString& name) {
+static int64_t EntryToIndex(const ZipStringOffset* hash_table, const uint32_t hash_table_size,
+                            const ZipString& name, const uint8_t* start) {
   const uint32_t hash = ComputeHash(name);
 
   // NOTE: (hash_table_size - 1) is guaranteed to be non-negative.
   uint32_t ent = hash & (hash_table_size - 1);
-  while (hash_table[ent].name != NULL) {
-    if (hash_table[ent] == name) {
+  while (hash_table[ent].name_offset != 0) {
+    if (isZipStringEqual(start, name, hash_table[ent])) {
       return ent;
     }
-
     ent = (ent + 1) & (hash_table_size - 1);
   }
 
@@ -147,8 +162,8 @@
 /*
  * Add a new entry to the hash table.
  */
-static int32_t AddToHash(ZipString* hash_table, const uint64_t hash_table_size,
-                         const ZipString& name) {
+static int32_t AddToHash(ZipStringOffset* hash_table, const uint64_t hash_table_size,
+                         const ZipString& name, const uint8_t* start) {
   const uint64_t hash = ComputeHash(name);
   uint32_t ent = hash & (hash_table_size - 1);
 
@@ -156,16 +171,15 @@
    * We over-allocated the table, so we're guaranteed to find an empty slot.
    * Further, we guarantee that the hashtable size is not 0.
    */
-  while (hash_table[ent].name != NULL) {
-    if (hash_table[ent] == name) {
+  while (hash_table[ent].name_offset != 0) {
+    if (isZipStringEqual(start, name, hash_table[ent])) {
       // We've found a duplicate entry. We don't accept it
       ALOGW("Zip: Found duplicate entry %.*s", name.name_length, name.name);
       return kDuplicateEntry;
     }
     ent = (ent + 1) & (hash_table_size - 1);
   }
-
-  hash_table[ent].name = name.name;
+  hash_table[ent].name_offset = GetOffset(name.name, start);
   hash_table[ent].name_length = name.name_length;
   return 0;
 }
@@ -209,7 +223,8 @@
 }
 
 static int32_t MapCentralDirectory0(const char* debug_file_name, ZipArchive* archive,
-                                    off64_t file_length, off64_t read_amount, uint8_t* scan_buffer) {
+                                    off64_t file_length, off64_t read_amount,
+                                    uint8_t* scan_buffer) {
   const off64_t search_start = file_length - read_amount;
 
   if (!archive->mapped_zip.ReadAtOffset(scan_buffer, read_amount, search_start)) {
@@ -362,7 +377,7 @@
    */
   archive->hash_table_size = RoundUpPower2(1 + (num_entries * 4) / 3);
   archive->hash_table =
-      reinterpret_cast<ZipString*>(calloc(archive->hash_table_size, sizeof(ZipString)));
+      reinterpret_cast<ZipStringOffset*>(calloc(archive->hash_table_size, sizeof(ZipStringOffset)));
   if (archive->hash_table == nullptr) {
     ALOGW("Zip: unable to allocate the %u-entry hash_table, entry size: %zu",
           archive->hash_table_size, sizeof(ZipString));
@@ -418,7 +433,8 @@
     ZipString entry_name;
     entry_name.name = file_name;
     entry_name.name_length = file_name_length;
-    const int add_result = AddToHash(archive->hash_table, archive->hash_table_size, entry_name);
+    const int add_result = AddToHash(archive->hash_table, archive->hash_table_size, entry_name,
+                                     archive->central_directory.GetBasePtr());
     if (add_result != 0) {
       ALOGW("Zip: Error adding entry to hash table %d", add_result);
       return add_result;
@@ -538,7 +554,9 @@
   // Recover the start of the central directory entry from the filename
   // pointer.  The filename is the first entry past the fixed-size data,
   // so we can just subtract back from that.
-  const uint8_t* ptr = archive->hash_table[ent].name;
+  const ZipString from_offset =
+      archive->hash_table[ent].GetZipString(archive->central_directory.GetBasePtr());
+  const uint8_t* ptr = from_offset.name;
   ptr -= sizeof(CentralDirectoryRecord);
 
   // This is the base of our mmapped region, we have to sanity check that
@@ -648,8 +666,9 @@
       ALOGW("Zip: failed reading lfh name from offset %" PRId64, static_cast<int64_t>(name_offset));
       return kIoError;
     }
-
-    if (memcmp(archive->hash_table[ent].name, name_buf.data(), nameLen)) {
+    const ZipString from_offset =
+        archive->hash_table[ent].GetZipString(archive->central_directory.GetBasePtr());
+    if (memcmp(from_offset.name, name_buf.data(), nameLen)) {
       return kInconsistentInformation;
     }
 
@@ -747,19 +766,19 @@
     return kInvalidEntryName;
   }
 
-  const int64_t ent = EntryToIndex(archive->hash_table, archive->hash_table_size, entryName);
-
+  const int64_t ent = EntryToIndex(archive->hash_table, archive->hash_table_size, entryName,
+                                   archive->central_directory.GetBasePtr());
   if (ent < 0) {
     ALOGV("Zip: Could not find entry %.*s", entryName.name_length, entryName.name);
     return ent;
   }
-
   return FindEntry(archive, ent, data);
 }
 
 int32_t Next(void* cookie, ZipEntry* data, ZipString* name) {
   IterationHandle* handle = reinterpret_cast<IterationHandle*>(cookie);
   if (handle == NULL) {
+    ALOGW("Zip: Null ZipArchiveHandle");
     return kInvalidHandle;
   }
 
@@ -771,19 +790,19 @@
 
   const uint32_t currentOffset = handle->position;
   const uint32_t hash_table_length = archive->hash_table_size;
-  const ZipString* hash_table = archive->hash_table;
-
+  const ZipStringOffset* hash_table = archive->hash_table;
   for (uint32_t i = currentOffset; i < hash_table_length; ++i) {
-    if (hash_table[i].name != NULL &&
-        (handle->prefix.name_length == 0 || hash_table[i].StartsWith(handle->prefix)) &&
-        (handle->suffix.name_length == 0 || hash_table[i].EndsWith(handle->suffix))) {
+    const ZipString from_offset =
+        hash_table[i].GetZipString(archive->central_directory.GetBasePtr());
+    if (hash_table[i].name_offset != 0 &&
+        (handle->prefix.name_length == 0 || from_offset.StartsWith(handle->prefix)) &&
+        (handle->suffix.name_length == 0 || from_offset.EndsWith(handle->suffix))) {
       handle->position = (i + 1);
       const int error = FindEntry(archive, i, data);
       if (!error) {
-        name->name = hash_table[i].name;
+        name->name = from_offset.name;
         name->name_length = hash_table[i].name_length;
       }
-
       return error;
     }
   }

diff --git a/libziparchive/zip_archive_private.h b/libziparchive/zip_archive_private.h
index 0a73300..83cb11f 100644
--- a/libziparchive/zip_archive_private.h
+++ b/libziparchive/zip_archive_private.h

@@ -136,6 +136,26 @@
   size_t length_;
 };
 
+/**
+ * More space efficient string representation of strings in an mmaped zipped file than
+ * std::string_view or ZipString. Using ZipString as an entry in the ZipArchive hashtable wastes
+ * space. ZipString stores a pointer to a string (on 64 bit, 8 bytes) and the length to read from
+ * that pointer, 2 bytes. Because of alignment, the structure consumes 16 bytes, wasting 6 bytes.
+ * ZipStringOffset stores a 4 byte offset from a fixed location in the memory mapped file instead
+ * of the entire address, consuming 8 bytes with alignment.
+ */
+struct ZipStringOffset {
+  uint32_t name_offset;
+  uint16_t name_length;
+
+  const ZipString GetZipString(const uint8_t* start) const {
+    ZipString zip_string;
+    zip_string.name = start + name_offset;
+    zip_string.name_length = name_length;
+    return zip_string;
+  }
+};
+
 struct ZipArchive {
   // open Zip archive
   mutable MappedZipFile mapped_zip;
@@ -154,7 +174,7 @@
   // allocate so the maximum number entries can never be higher than
   // ((4 * UINT16_MAX) / 3 + 1) which can safely fit into a uint32_t.
   uint32_t hash_table_size;
-  ZipString* hash_table;
+  ZipStringOffset* hash_table;
 
   ZipArchive(const int fd, bool assume_ownership);
   ZipArchive(void* address, size_t length);
commit	50947c1b29780769aca5e9482c9a3119943a43cd	[log] [tgz]
author	Treehugger Robot <treehugger-gerrit@google.com>	Thu Sep 20 06:45:33 2018 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	Thu Sep 20 06:45:33 2018 +0000
tree	5b5c769115e061d3cee4001975f1f18e38e0953f
parent	b649d39bd7bfd6084e43c96a8b88fc30ba3e8b2d [diff]
parent	5a503efaf03227a99f54f50657c922c9e8a6af5e [diff]