-rw-r--r--  gn/tests.gni                           |   1
-rw-r--r--  include/gpu/vk/GrVkBackendContext.h    |   4
-rw-r--r--  include/gpu/vk/GrVkTypes.h             |   2
-rw-r--r--  src/gpu/vk/GrVkAMDMemoryAllocator.cpp  |  44
-rw-r--r--  src/gpu/vk/GrVkBackendContext.cpp      |   3
-rw-r--r--  src/gpu/vk/GrVkBuffer.cpp              |  37
-rw-r--r--  src/gpu/vk/GrVkBuffer.h                |   5
-rw-r--r--  src/gpu/vk/GrVkGpu.cpp                 |  83
-rw-r--r--  src/gpu/vk/GrVkGpu.h                   |  28
-rw-r--r--  src/gpu/vk/GrVkMemory.cpp              | 661
-rw-r--r--  src/gpu/vk/GrVkMemory.h                | 138
-rw-r--r--  tests/VkHeapTests.cpp                  | 239
12 files changed, 1056 insertions, 189 deletions
diff --git a/gn/tests.gni b/gn/tests.gni index 5b4594099b..a359ca3e3c 100644 --- a/gn/tests.gni +++ b/gn/tests.gni @@ -275,6 +275,7 @@ tests_sources = [ "$_tests/UtilsTest.cpp", "$_tests/VerticesTest.cpp", "$_tests/VkBackendSurfaceTest.cpp", + "$_tests/VkHeapTests.cpp", "$_tests/VkMakeCopyPipelineTest.cpp", "$_tests/VkUploadPixelsTests.cpp", "$_tests/VkWrapTests.cpp", diff --git a/include/gpu/vk/GrVkBackendContext.h b/include/gpu/vk/GrVkBackendContext.h index 212362873a..fdc71d373f 100644 --- a/include/gpu/vk/GrVkBackendContext.h +++ b/include/gpu/vk/GrVkBackendContext.h @@ -13,8 +13,6 @@ #include "vk/GrVkDefines.h" #include "vk/GrVkInterface.h" -class GrVkMemoryAllocator; - enum GrVkExtensionFlags { kEXT_debug_report_GrVkExtensionFlag = 0x0001, kNV_glsl_shader_GrVkExtensionFlag = 0x0002, @@ -47,8 +45,6 @@ struct SK_API GrVkBackendContext : public SkRefCnt { uint32_t fExtensions; uint32_t fFeatures; sk_sp<const GrVkInterface> fInterface; - sk_sp<GrVkMemoryAllocator> fMemoryAllocator; - /** * Controls whether this object destroys the instance and device upon destruction. The default * is temporarily 'true' to avoid breaking existing clients but will be changed to 'false'. diff --git a/include/gpu/vk/GrVkTypes.h b/include/gpu/vk/GrVkTypes.h index 9225e92778..2e31250324 100644 --- a/include/gpu/vk/GrVkTypes.h +++ b/include/gpu/vk/GrVkTypes.h @@ -10,7 +10,7 @@ #define GrVkTypes_DEFINED #include "GrTypes.h" -#include "GrVkDefines.h" +#include "vk/GrVkDefines.h" /** * KHR_debug diff --git a/src/gpu/vk/GrVkAMDMemoryAllocator.cpp b/src/gpu/vk/GrVkAMDMemoryAllocator.cpp index 93e2fff494..0b838ece3a 100644 --- a/src/gpu/vk/GrVkAMDMemoryAllocator.cpp +++ b/src/gpu/vk/GrVkAMDMemoryAllocator.cpp @@ -8,7 +8,6 @@ #include "GrVkAMDMemoryAllocator.h" #include "vk/GrVkInterface.h" -#include "GrVkMemory.h" #include "GrVkUtil.h" GrVkAMDMemoryAllocator::GrVkAMDMemoryAllocator(VkPhysicalDevice physicalDevice, @@ -43,10 +42,7 @@ GrVkAMDMemoryAllocator::GrVkAMDMemoryAllocator(VkPhysicalDevice physicalDevice, info.flags = 0; info.physicalDevice = physicalDevice; info.device = device; - // Manually testing runs of dm using 64 here instead of the default 256 shows less memory usage - // on average. Also dm seems to run faster using 64 so it doesn't seem to be trading off speed - // for memory. 
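For context, preferredLargeHeapBlockSize controls how large each VkDeviceMemory block the AMD Vulkan Memory Allocator suballocates from will be; 0 selects the library default (256 MB, per the removed comment, versus the 64 MB override being reverted). A minimal sketch of this allocator setup, with illustrative values rather than Skia's:

    #include "vk_mem_alloc.h"  // AMD Vulkan Memory Allocator

    VmaAllocatorCreateInfo info = {};
    info.physicalDevice = physicalDevice;  // assumed valid Vulkan handles
    info.device = device;
    // 0 = use VMA's built-in default block size (256 MiB)
    info.preferredLargeHeapBlockSize = 0;

    VmaAllocator allocator;
    VkResult err = vmaCreateAllocator(&info, &allocator);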
- info.preferredLargeHeapBlockSize = 64*1024*1024; + info.preferredLargeHeapBlockSize = 0; info.pAllocationCallbacks = nullptr; info.pDeviceMemoryCallbacks = nullptr; info.frameInUseCount = 0; @@ -202,9 +198,24 @@ void GrVkAMDMemoryAllocator::flushMappedMemory(const GrVkBackendMemory& memoryHa vmaGetPhysicalDeviceProperties(fAllocator, &physDevProps); VkDeviceSize alignment = physDevProps->limits.nonCoherentAtomSize; + offset = offset + info.fOffset; + VkDeviceSize offsetDiff = offset & (alignment -1); + offset = offset - offsetDiff; + size = (size + alignment - 1) & ~(alignment - 1); +#ifdef SK_DEBUG + SkASSERT(offset >= info.fOffset); + SkASSERT(offset + size <= info.fOffset + info.fSize); + SkASSERT(0 == (offset & (alignment-1))); + SkASSERT(size > 0); + SkASSERT(0 == (size & (alignment-1))); +#endif + VkMappedMemoryRange mappedMemoryRange; - GrVkMemory::GetNonCoherentMappedMemoryRange(info, offset, size, alignment, - &mappedMemoryRange); + memset(&mappedMemoryRange, 0, sizeof(VkMappedMemoryRange)); + mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mappedMemoryRange.memory = info.fMemory; + mappedMemoryRange.offset = offset; + mappedMemoryRange.size = size; GR_VK_CALL(fInterface, FlushMappedMemoryRanges(fDevice, 1, &mappedMemoryRange)); } } @@ -220,9 +231,24 @@ void GrVkAMDMemoryAllocator::invalidateMappedMemory(const GrVkBackendMemory& mem vmaGetPhysicalDeviceProperties(fAllocator, &physDevProps); VkDeviceSize alignment = physDevProps->limits.nonCoherentAtomSize; + offset = offset + info.fOffset; + VkDeviceSize offsetDiff = offset & (alignment -1); + offset = offset - offsetDiff; + size = (size + alignment - 1) & ~(alignment - 1); +#ifdef SK_DEBUG + SkASSERT(offset >= info.fOffset); + SkASSERT(offset + size <= info.fOffset + info.fSize); + SkASSERT(0 == (offset & (alignment-1))); + SkASSERT(size > 0); + SkASSERT(0 == (size & (alignment-1))); +#endif + VkMappedMemoryRange mappedMemoryRange; - GrVkMemory::GetNonCoherentMappedMemoryRange(info, offset, size, alignment, - &mappedMemoryRange); + memset(&mappedMemoryRange, 0, sizeof(VkMappedMemoryRange)); + mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mappedMemoryRange.memory = info.fMemory; + mappedMemoryRange.offset = offset; + mappedMemoryRange.size = size; GR_VK_CALL(fInterface, InvalidateMappedMemoryRanges(fDevice, 1, &mappedMemoryRange)); } } diff --git a/src/gpu/vk/GrVkBackendContext.cpp b/src/gpu/vk/GrVkBackendContext.cpp index 196b141493..269a8911e4 100644 --- a/src/gpu/vk/GrVkBackendContext.cpp +++ b/src/gpu/vk/GrVkBackendContext.cpp @@ -8,7 +8,7 @@ #include "SkAutoMalloc.h" #include "vk/GrVkBackendContext.h" #include "vk/GrVkExtensions.h" -#include "vk/GrVkMemoryAllocator.h" +#include "vk/GrVkInterface.h" #include "vk/GrVkUtil.h" //////////////////////////////////////////////////////////////////////////////// @@ -323,7 +323,6 @@ const GrVkBackendContext* GrVkBackendContext::Create(uint32_t* presentQueueIndex } GrVkBackendContext::~GrVkBackendContext() { - fMemoryAllocator.reset(); if (fInterface == nullptr || !fOwnsInstanceAndDevice) { return; } diff --git a/src/gpu/vk/GrVkBuffer.cpp b/src/gpu/vk/GrVkBuffer.cpp index b3c1d825aa..f65b15ded0 100644 --- a/src/gpu/vk/GrVkBuffer.cpp +++ b/src/gpu/vk/GrVkBuffer.cpp @@ -170,10 +170,28 @@ void GrVkBuffer::internalMap(GrVkGpu* gpu, size_t size, bool* createdNewBuffer) if (fDesc.fDynamic) { const GrVkAlloc& alloc = this->alloc(); SkASSERT(alloc.fSize > 0); - SkASSERT(alloc.fSize >= size); - SkASSERT(0 == fOffset); - fMapPtr = GrVkMemory::MapAlloc(gpu, 
alloc); + // For Noncoherent buffers we want to make sure the range that we map, both offset and size, + // are aligned to the nonCoherentAtomSize limit. The offset should have been correctly + // aligned by our memory allocator. For size we pad out to make the range also aligned. + if (SkToBool(alloc.fFlags & GrVkAlloc::kNoncoherent_Flag)) { + // Currently we always have the internal offset as 0. + SkASSERT(0 == fOffset); + VkDeviceSize alignment = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize; + SkASSERT(0 == (alloc.fOffset & (alignment - 1))); + + // Make size of the map aligned to nonCoherentAtomSize + size = (size + alignment - 1) & ~(alignment - 1); + fMappedSize = size; + } + SkASSERT(size + fOffset <= alloc.fSize); + VkResult err = VK_CALL(gpu, MapMemory(gpu->device(), alloc.fMemory, + alloc.fOffset + fOffset, + size, 0, &fMapPtr)); + if (err) { + fMapPtr = nullptr; + fMappedSize = 0; + } } else { if (!fMapPtr) { fMapPtr = new unsigned char[this->size()]; @@ -188,15 +206,16 @@ void GrVkBuffer::internalUnmap(GrVkGpu* gpu, size_t size) { SkASSERT(this->vkIsMapped()); if (fDesc.fDynamic) { - const GrVkAlloc& alloc = this->alloc(); - SkASSERT(alloc.fSize > 0); - SkASSERT(alloc.fSize >= size); // We currently don't use fOffset SkASSERT(0 == fOffset); + VkDeviceSize flushOffset = this->alloc().fOffset + fOffset; + VkDeviceSize flushSize = gpu->vkCaps().canUseWholeSizeOnFlushMappedMemory() ? VK_WHOLE_SIZE + : fMappedSize; - GrVkMemory::FlushMappedAlloc(gpu, alloc, 0, size); - GrVkMemory::UnmapAlloc(gpu, alloc); + GrVkMemory::FlushMappedAlloc(gpu, this->alloc(), flushOffset, flushSize); + VK_CALL(gpu, UnmapMemory(gpu->device(), this->alloc().fMemory)); fMapPtr = nullptr; + fMappedSize = 0; } else { // vkCmdUpdateBuffer requires size < 64k and 4-byte alignment. // https://bugs.chromium.org/p/skia/issues/detail?id=7488 @@ -205,7 +224,7 @@ void GrVkBuffer::internalUnmap(GrVkGpu* gpu, size_t size) { } else { GrVkTransferBuffer* transferBuffer = GrVkTransferBuffer::Create(gpu, size, GrVkBuffer::kCopyRead_Type); - if (!transferBuffer) { + if(!transferBuffer) { return; } diff --git a/src/gpu/vk/GrVkBuffer.h b/src/gpu/vk/GrVkBuffer.h index 6d0c1fda9a..8d116a40f8 100644 --- a/src/gpu/vk/GrVkBuffer.h +++ b/src/gpu/vk/GrVkBuffer.h @@ -82,7 +82,7 @@ protected: const Desc& descriptor); GrVkBuffer(const Desc& desc, const GrVkBuffer::Resource* resource) - : fDesc(desc), fResource(resource), fOffset(0), fMapPtr(nullptr) { + : fDesc(desc), fResource(resource), fOffset(0), fMapPtr(nullptr), fMappedSize(0) { } void* vkMap(GrVkGpu* gpu) { @@ -115,6 +115,9 @@ private: const Resource* fResource; VkDeviceSize fOffset; void* fMapPtr; + // On certain Intel devices/drivers there is a bug if we try to flush non-coherent memory and + // pass in VK_WHOLE_SIZE. 
Thus we track our mapped size and explicitly set it when calling flush + VkDeviceSize fMappedSize; typedef SkNoncopyable INHERITED; }; diff --git a/src/gpu/vk/GrVkGpu.cpp b/src/gpu/vk/GrVkGpu.cpp index 2525c5c16c..56d0b95bd0 100644 --- a/src/gpu/vk/GrVkGpu.cpp +++ b/src/gpu/vk/GrVkGpu.cpp @@ -17,7 +17,6 @@ #include "GrRenderTargetPriv.h" #include "GrTexturePriv.h" -#include "GrVkAMDMemoryAllocator.h" #include "GrVkCommandBuffer.h" #include "GrVkGpuCommandBuffer.h" #include "GrVkImage.h" @@ -93,7 +92,6 @@ GrVkGpu::GrVkGpu(GrContext* context, const GrContextOptions& options, sk_sp<const GrVkBackendContext> backendCtx) : INHERITED(context) , fBackendContext(std::move(backendCtx)) - , fMemoryAllocator(fBackendContext->fMemoryAllocator) , fDevice(fBackendContext->fDevice) , fQueue(fBackendContext->fQueue) , fResourceProvider(this) @@ -120,12 +118,6 @@ GrVkGpu::GrVkGpu(GrContext* context, const GrContextOptions& options, } #endif - if (!fMemoryAllocator) { - // We were not given a memory allocator at creation - fMemoryAllocator.reset(new GrVkAMDMemoryAllocator(fBackendContext->fPhysicalDevice, - fDevice, fBackendContext->fInterface)); - } - fCompiler = new SkSL::Compiler(); fVkCaps.reset(new GrVkCaps(options, this->vkInterface(), fBackendContext->fPhysicalDevice, @@ -150,6 +142,17 @@ GrVkGpu::GrVkGpu(GrContext* context, const GrContextOptions& options, fCurrentCmdBuffer = fResourceProvider.findOrCreatePrimaryCommandBuffer(); SkASSERT(fCurrentCmdBuffer); fCurrentCmdBuffer->begin(this); + + // set up our heaps + fHeaps[kLinearImage_Heap].reset(new GrVkHeap(this, GrVkHeap::kSubAlloc_Strategy, 16*1024*1024)); + fHeaps[kOptimalImage_Heap].reset(new GrVkHeap(this, GrVkHeap::kSubAlloc_Strategy, 64*1024*1024)); + fHeaps[kSmallOptimalImage_Heap].reset(new GrVkHeap(this, GrVkHeap::kSubAlloc_Strategy, 2*1024*1024)); + fHeaps[kVertexBuffer_Heap].reset(new GrVkHeap(this, GrVkHeap::kSingleAlloc_Strategy, 0)); + fHeaps[kIndexBuffer_Heap].reset(new GrVkHeap(this, GrVkHeap::kSingleAlloc_Strategy, 0)); + fHeaps[kUniformBuffer_Heap].reset(new GrVkHeap(this, GrVkHeap::kSubAlloc_Strategy, 256*1024)); + fHeaps[kTexelBuffer_Heap].reset(new GrVkHeap(this, GrVkHeap::kSingleAlloc_Strategy, 0)); + fHeaps[kCopyReadBuffer_Heap].reset(new GrVkHeap(this, GrVkHeap::kSingleAlloc_Strategy, 0)); + fHeaps[kCopyWriteBuffer_Heap].reset(new GrVkHeap(this, GrVkHeap::kSubAlloc_Strategy, 16*1024*1024)); } void GrVkGpu::destroyResources() { @@ -559,6 +562,7 @@ bool GrVkGpu::uploadTexDataLinear(GrVkTexture* tex, GrSurfaceOrigin texOrigin, i 0, // arraySlice }; VkSubresourceLayout layout; + VkResult err; const GrVkInterface* interface = this->vkInterface(); @@ -569,14 +573,28 @@ bool GrVkGpu::uploadTexDataLinear(GrVkTexture* tex, GrSurfaceOrigin texOrigin, i int texTop = kBottomLeft_GrSurfaceOrigin == texOrigin ? tex->height() - top - height : top; const GrVkAlloc& alloc = tex->alloc(); - VkDeviceSize offset = texTop*layout.rowPitch + left*bpp; + VkDeviceSize offset = alloc.fOffset + texTop*layout.rowPitch + left*bpp; + VkDeviceSize offsetDiff = 0; VkDeviceSize size = height*layout.rowPitch; - SkASSERT(size + offset <= alloc.fSize); - void* mapPtr = GrVkMemory::MapAlloc(this, alloc); - if (!mapPtr) { + // For Noncoherent buffers we want to make sure the range that we map, both offset and size, + // are aligned to the nonCoherentAtomSize limit. We may have to move the initial offset back to + // meet the alignment requirements. So we track how far we move back and then adjust the mapped + // ptr back up so that this is opaque to the caller. 
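Every non-coherent alignment fix-up in this change follows the same pattern: round the start of the mapped range down to the previous nonCoherentAtomSize boundary, remember how far it moved so the returned pointer can be advanced by the same amount, and round the length up to the next boundary. A standalone sketch (the helper name is hypothetical; the bit masks rely on the power-of-two property that the SkASSERT(SkIsPow2(...)) checks elsewhere in this patch enforce):

    // Returns how far 'offset' was moved back; the caller adds this to the
    // pointer vkMapMemory returns so callers see the address they asked for.
    static VkDeviceSize align_mapped_range(VkDeviceSize alignment,
                                           VkDeviceSize* offset,
                                           VkDeviceSize* size) {
        VkDeviceSize offsetDiff = *offset & (alignment - 1); // distance past boundary
        *offset -= offsetDiff;                               // round start down
        *size = (*size + alignment - 1) & ~(alignment - 1);  // round length up
        return offsetDiff;
    }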
+ if (SkToBool(alloc.fFlags & GrVkAlloc::kNoncoherent_Flag)) { + VkDeviceSize alignment = this->physicalDeviceProperties().limits.nonCoherentAtomSize; + offsetDiff = offset & (alignment - 1); + offset = offset - offsetDiff; + // Make size of the map aligned to nonCoherentAtomSize + size = (size + alignment - 1) & ~(alignment - 1); + } + SkASSERT(offset >= alloc.fOffset); + SkASSERT(size <= alloc.fOffset + alloc.fSize); + void* mapPtr; + err = GR_VK_CALL(interface, MapMemory(fDevice, alloc.fMemory, offset, size, 0, &mapPtr)); + if (err) { return false; } - mapPtr = reinterpret_cast<char*>(mapPtr) + offset; + mapPtr = reinterpret_cast<char*>(mapPtr) + offsetDiff; if (kBottomLeft_GrSurfaceOrigin == texOrigin) { // copy into buffer by rows @@ -593,7 +611,7 @@ bool GrVkGpu::uploadTexDataLinear(GrVkTexture* tex, GrSurfaceOrigin texOrigin, i } GrVkMemory::FlushMappedAlloc(this, alloc, offset, size); - GrVkMemory::UnmapAlloc(this, alloc); + GR_VK_CALL(interface, UnmapMemory(fDevice, alloc.fMemory)); return true; } @@ -1129,14 +1147,33 @@ GrStencilAttachment* GrVkGpu::createStencilAttachmentForRenderTarget(const GrRen bool copy_testing_data(GrVkGpu* gpu, const void* srcData, const GrVkAlloc& alloc, size_t bufferOffset, size_t srcRowBytes, size_t dstRowBytes, int h) { - VkDeviceSize size = dstRowBytes * h; - VkDeviceSize offset = bufferOffset; - SkASSERT(size + offset <= alloc.fSize); - void* mapPtr = GrVkMemory::MapAlloc(gpu, alloc); - if (!mapPtr) { + // For Noncoherent buffers we want to make sure the range that we map, both offset and size, + // are aligned to the nonCoherentAtomSize limit. We may have to move the initial offset back to + // meet the alignment requirements. So we track how far we move back and then adjust the mapped + // ptr back up so that this is opaque to the caller. + VkDeviceSize mapSize = dstRowBytes * h; + VkDeviceSize mapOffset = alloc.fOffset + bufferOffset; + VkDeviceSize offsetDiff = 0; + if (SkToBool(alloc.fFlags & GrVkAlloc::kNoncoherent_Flag)) { + VkDeviceSize alignment = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize; + offsetDiff = mapOffset & (alignment - 1); + mapOffset = mapOffset - offsetDiff; + // Make size of the map aligned to nonCoherentAtomSize + mapSize = (mapSize + alignment - 1) & ~(alignment - 1); + } + SkASSERT(mapOffset >= alloc.fOffset); + SkASSERT(mapSize + mapOffset <= alloc.fOffset + alloc.fSize); + void* mapPtr; + VkResult err = GR_VK_CALL(gpu->vkInterface(), MapMemory(gpu->device(), + alloc.fMemory, + mapOffset, + mapSize, + 0, + &mapPtr)); + mapPtr = reinterpret_cast<char*>(mapPtr) + offsetDiff; + if (err) { return false; } - mapPtr = reinterpret_cast<char*>(mapPtr) + offset; if (srcData) { // If there is no padding on dst we can do a single memcopy. 
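The copy body elided by the hunk boundary here is the usual pitched copy: when source and destination row pitches match, the rows are contiguous and one memcpy of dstRowBytes * h covers everything; otherwise each row is copied at its own pitch. A minimal sketch (names hypothetical):

    #include <cstring>

    static void copy_pitched_rows(void* dst, const void* src,
                                  size_t dstRowBytes, size_t srcRowBytes, int h) {
        if (dstRowBytes == srcRowBytes) {
            memcpy(dst, src, dstRowBytes * static_cast<size_t>(h));  // no padding: single copy
            return;
        }
        char* d = static_cast<char*>(dst);
        const char* s = static_cast<const char*>(src);
        for (int y = 0; y < h; ++y) {  // copy row by row, honoring each pitch
            memcpy(d, s, srcRowBytes);
            d += dstRowBytes;
            s += srcRowBytes;
        }
    }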
@@ -1155,8 +1192,8 @@ bool copy_testing_data(GrVkGpu* gpu, const void* srcData, const GrVkAlloc& alloc } } } - GrVkMemory::FlushMappedAlloc(gpu, alloc, offset, size); - GrVkMemory::UnmapAlloc(gpu, alloc); + GrVkMemory::FlushMappedAlloc(gpu, alloc, mapOffset, mapSize); + GR_VK_CALL(gpu->vkInterface(), UnmapMemory(gpu->device(), alloc.fMemory)); return true; } @@ -1980,7 +2017,7 @@ bool GrVkGpu::onReadPixels(GrSurface* surface, GrSurfaceOrigin origin, int left, this->submitCommandBuffer(kForce_SyncQueue); void* mappedMemory = transferBuffer->map(); const GrVkAlloc& transAlloc = transferBuffer->alloc(); - GrVkMemory::InvalidateMappedAlloc(this, transAlloc, 0, transAlloc.fSize); + GrVkMemory::InvalidateMappedAlloc(this, transAlloc, transAlloc.fOffset, VK_WHOLE_SIZE); if (copyFromOrigin) { uint32_t skipRows = region.imageExtent.height - height; diff --git a/src/gpu/vk/GrVkGpu.h b/src/gpu/vk/GrVkGpu.h index a44ea7230f..7bdfbeaab3 100644 --- a/src/gpu/vk/GrVkGpu.h +++ b/src/gpu/vk/GrVkGpu.h @@ -23,7 +23,6 @@ class GrPipeline; class GrVkBufferImpl; -class GrVkMemoryAllocator; class GrVkPipeline; class GrVkPipelineState; class GrVkPrimaryCommandBuffer; @@ -47,8 +46,6 @@ public: const GrVkInterface* vkInterface() const { return fBackendContext->fInterface.get(); } const GrVkCaps& vkCaps() const { return *fVkCaps; } - GrVkMemoryAllocator* memoryAllocator() const { return fMemoryAllocator.get(); } - VkDevice device() const { return fDevice; } VkQueue queue() const { return fQueue; } VkCommandPool cmdPool() const { return fCmdPool; } @@ -143,6 +140,28 @@ public: VkDeviceSize dstOffset, VkDeviceSize size); bool updateBuffer(GrVkBuffer* buffer, const void* src, VkDeviceSize offset, VkDeviceSize size); + // Heaps + enum Heap { + kLinearImage_Heap = 0, + // We separate out small (i.e., <= 16K) images to reduce fragmentation + // in the main heap. + kOptimalImage_Heap, + kSmallOptimalImage_Heap, + // We have separate vertex and image heaps, because it's possible that + // a given Vulkan driver may allocate them separately. + kVertexBuffer_Heap, + kIndexBuffer_Heap, + kUniformBuffer_Heap, + kTexelBuffer_Heap, + kCopyReadBuffer_Heap, + kCopyWriteBuffer_Heap, + + kLastHeap = kCopyWriteBuffer_Heap + }; + static const int kHeapCount = kLastHeap + 1; + + GrVkHeap* getHeap(Heap heap) const { return fHeaps[heap].get(); } + private: GrVkGpu(GrContext*, const GrContextOptions&, sk_sp<const GrVkBackendContext> backendContext); @@ -232,7 +251,6 @@ private: #endif sk_sp<const GrVkBackendContext> fBackendContext; - sk_sp<GrVkMemoryAllocator> fMemoryAllocator; sk_sp<GrVkCaps> fVkCaps; // These Vulkan objects are provided by the client, and also stored in fBackendContext. 
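Tying the heap enum above to the allocation paths later in GrVkMemory.cpp: linear-tiled (CPU-accessible) images get their own heap, and optimal-tiled images split on the 16K kMaxSmallImageSize cutoff so small images don't fragment the main image heap. A sketch of that selection (the helper is hypothetical; the branches mirror FreeImageMemory):

    GrVkGpu::Heap choose_image_heap(bool linearTiling, VkDeviceSize size) {
        if (linearTiling) {
            return GrVkGpu::kLinearImage_Heap;  // host-visible images
        }
        return size <= 16 * 1024 ? GrVkGpu::kSmallOptimalImage_Heap
                                 : GrVkGpu::kOptimalImage_Heap;
    }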
@@ -252,6 +270,8 @@ private: VkPhysicalDeviceProperties fPhysDevProps; VkPhysicalDeviceMemoryProperties fPhysDevMemProps; + std::unique_ptr<GrVkHeap> fHeaps[kHeapCount]; + GrVkCopyManager fCopyManager; #ifdef SK_ENABLE_VK_LAYERS diff --git a/src/gpu/vk/GrVkMemory.cpp b/src/gpu/vk/GrVkMemory.cpp index f999c26546..4f619a3ef3 100644 --- a/src/gpu/vk/GrVkMemory.cpp +++ b/src/gpu/vk/GrVkMemory.cpp @@ -9,26 +9,49 @@ #include "GrVkGpu.h" #include "GrVkUtil.h" -#include "vk/GrVkMemoryAllocator.h" -using AllocationPropertyFlags = GrVkMemoryAllocator::AllocationPropertyFlags; -using BufferUsage = GrVkMemoryAllocator::BufferUsage; +#ifdef SK_DEBUG +// for simple tracking of how much we're using in each heap +// last counter is for non-subheap allocations +VkDeviceSize gHeapUsage[VK_MAX_MEMORY_HEAPS+1] = { 0 }; +#endif -static BufferUsage get_buffer_usage(GrVkBuffer::Type type, bool dynamic) { - switch (type) { - case GrVkBuffer::kVertex_Type: // fall through - case GrVkBuffer::kIndex_Type: // fall through - case GrVkBuffer::kTexel_Type: - return dynamic ? BufferUsage::kCpuWritesGpuReads : BufferUsage::kGpuOnly; - case GrVkBuffer::kUniform_Type: - SkASSERT(dynamic); - return BufferUsage::kCpuWritesGpuReads; - case GrVkBuffer::kCopyRead_Type: // fall through - case GrVkBuffer::kCopyWrite_Type: - return BufferUsage::kCpuOnly; +static bool get_valid_memory_type_index(const VkPhysicalDeviceMemoryProperties& physDevMemProps, + uint32_t typeBits, + VkMemoryPropertyFlags requestedMemFlags, + uint32_t* typeIndex, + uint32_t* heapIndex) { + for (uint32_t i = 0; i < physDevMemProps.memoryTypeCount; ++i) { + if (typeBits & (1 << i)) { + uint32_t supportedFlags = physDevMemProps.memoryTypes[i].propertyFlags & + requestedMemFlags; + if (supportedFlags == requestedMemFlags) { + *typeIndex = i; + *heapIndex = physDevMemProps.memoryTypes[i].heapIndex; + return true; + } + } } - SK_ABORT("Invalid GrVkBuffer::Type"); - return BufferUsage::kCpuOnly; // Just returning an arbitrary value. 
+ return false; +} + +static GrVkGpu::Heap buffer_type_to_heap(GrVkBuffer::Type type) { + const GrVkGpu::Heap kBufferToHeap[]{ + GrVkGpu::kVertexBuffer_Heap, + GrVkGpu::kIndexBuffer_Heap, + GrVkGpu::kUniformBuffer_Heap, + GrVkGpu::kTexelBuffer_Heap, + GrVkGpu::kCopyReadBuffer_Heap, + GrVkGpu::kCopyWriteBuffer_Heap, + }; + GR_STATIC_ASSERT(0 == GrVkBuffer::kVertex_Type); + GR_STATIC_ASSERT(1 == GrVkBuffer::kIndex_Type); + GR_STATIC_ASSERT(2 == GrVkBuffer::kUniform_Type); + GR_STATIC_ASSERT(3 == GrVkBuffer::kTexel_Type); + GR_STATIC_ASSERT(4 == GrVkBuffer::kCopyRead_Type); + GR_STATIC_ASSERT(5 == GrVkBuffer::kCopyWrite_Type); + + return kBufferToHeap[type]; } bool GrVkMemory::AllocAndBindBufferMemory(const GrVkGpu* gpu, @@ -36,23 +59,68 @@ bool GrVkMemory::AllocAndBindBufferMemory(const GrVkGpu* gpu, GrVkBuffer::Type type, bool dynamic, GrVkAlloc* alloc) { - GrVkMemoryAllocator* allocator = gpu->memoryAllocator(); - GrVkBackendMemory memory = 0; + const GrVkInterface* iface = gpu->vkInterface(); + VkDevice device = gpu->device(); - GrVkMemoryAllocator::BufferUsage usage = get_buffer_usage(type, dynamic); + VkMemoryRequirements memReqs; + GR_VK_CALL(iface, GetBufferMemoryRequirements(device, buffer, &memReqs)); - if (!allocator->allocateMemoryForBuffer(buffer, usage, AllocationPropertyFlags::kNone, - &memory)) { - return false; + uint32_t typeIndex = 0; + uint32_t heapIndex = 0; + const VkPhysicalDeviceMemoryProperties& phDevMemProps = gpu->physicalDeviceMemoryProperties(); + const VkPhysicalDeviceProperties& phDevProps = gpu->physicalDeviceProperties(); + if (dynamic) { + // try to get cached and ideally non-coherent memory first + if (!get_valid_memory_type_index(phDevMemProps, + memReqs.memoryTypeBits, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT, + &typeIndex, + &heapIndex)) { + // some sort of host-visible memory type should always be available for dynamic buffers + SkASSERT_RELEASE(get_valid_memory_type_index(phDevMemProps, + memReqs.memoryTypeBits, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, + &typeIndex, + &heapIndex)); + } + + VkMemoryPropertyFlags mpf = phDevMemProps.memoryTypes[typeIndex].propertyFlags; + alloc->fFlags = mpf & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT ? 
0x0 + : GrVkAlloc::kNoncoherent_Flag; + if (SkToBool(alloc->fFlags & GrVkAlloc::kNoncoherent_Flag)) { + SkASSERT(SkIsPow2(memReqs.alignment)); + SkASSERT(SkIsPow2(phDevProps.limits.nonCoherentAtomSize)); + memReqs.alignment = SkTMax(memReqs.alignment, phDevProps.limits.nonCoherentAtomSize); + } + } else { + // device-local memory should always be available for static buffers + SkASSERT_RELEASE(get_valid_memory_type_index(phDevMemProps, + memReqs.memoryTypeBits, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + &typeIndex, + &heapIndex)); + alloc->fFlags = 0x0; + } + + GrVkHeap* heap = gpu->getHeap(buffer_type_to_heap(type)); + + if (!heap->alloc(memReqs.size, memReqs.alignment, typeIndex, heapIndex, alloc)) { + // if static, try to allocate from non-host-visible non-device-local memory instead + if (dynamic || + !get_valid_memory_type_index(phDevMemProps, memReqs.memoryTypeBits, + 0, &typeIndex, &heapIndex) || + !heap->alloc(memReqs.size, memReqs.alignment, typeIndex, heapIndex, alloc)) { + SkDebugf("Failed to alloc buffer\n"); + return false; + } } - allocator->getAllocInfo(memory, alloc); // Bind buffer - VkResult err = GR_VK_CALL(gpu->vkInterface(), BindBufferMemory(gpu->device(), buffer, - alloc->fMemory, - alloc->fOffset)); + VkResult err = GR_VK_CALL(iface, BindBufferMemory(device, buffer, + alloc->fMemory, alloc->fOffset)); if (err) { - FreeBufferMemory(gpu, type, *alloc); + SkASSERT_RELEASE(heap->free(*alloc)); return false; } @@ -61,152 +129,503 @@ bool GrVkMemory::AllocAndBindBufferMemory(const GrVkGpu* gpu, void GrVkMemory::FreeBufferMemory(const GrVkGpu* gpu, GrVkBuffer::Type type, const GrVkAlloc& alloc) { - if (alloc.fBackendMemory) { - GrVkMemoryAllocator* allocator = gpu->memoryAllocator(); - allocator->freeMemory(alloc.fBackendMemory); - } else { - GR_VK_CALL(gpu->vkInterface(), FreeMemory(gpu->device(), alloc.fMemory, nullptr)); - } + + GrVkHeap* heap = gpu->getHeap(buffer_type_to_heap(type)); + SkASSERT_RELEASE(heap->free(alloc)); } +// for debugging +static uint64_t gTotalImageMemory = 0; +static uint64_t gTotalImageMemoryFullPage = 0; + const VkDeviceSize kMaxSmallImageSize = 16 * 1024; +const VkDeviceSize kMinVulkanPageSize = 16 * 1024; + +static VkDeviceSize align_size(VkDeviceSize size, VkDeviceSize alignment) { + return (size + alignment - 1) & ~(alignment - 1); +} bool GrVkMemory::AllocAndBindImageMemory(const GrVkGpu* gpu, VkImage image, bool linearTiling, GrVkAlloc* alloc) { - SkASSERT(!linearTiling); - GrVkMemoryAllocator* allocator = gpu->memoryAllocator(); - GrVkBackendMemory memory = 0; + const GrVkInterface* iface = gpu->vkInterface(); + VkDevice device = gpu->device(); VkMemoryRequirements memReqs; - GR_VK_CALL(gpu->vkInterface(), GetImageMemoryRequirements(gpu->device(), image, &memReqs)); + GR_VK_CALL(iface, GetImageMemoryRequirements(device, image, &memReqs)); - AllocationPropertyFlags propFlags; - if (memReqs.size <= kMaxSmallImageSize) { - propFlags = AllocationPropertyFlags::kNone; + uint32_t typeIndex = 0; + uint32_t heapIndex = 0; + GrVkHeap* heap; + const VkPhysicalDeviceMemoryProperties& phDevMemProps = gpu->physicalDeviceMemoryProperties(); + const VkPhysicalDeviceProperties& phDevProps = gpu->physicalDeviceProperties(); + if (linearTiling) { + VkMemoryPropertyFlags desiredMemProps = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + if (!get_valid_memory_type_index(phDevMemProps, + memReqs.memoryTypeBits, + desiredMemProps, + &typeIndex, + &heapIndex)) { + // some sort of host-visible memory type should always be available + 
SkASSERT_RELEASE(get_valid_memory_type_index(phDevMemProps, + memReqs.memoryTypeBits, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, + &typeIndex, + &heapIndex)); + } + heap = gpu->getHeap(GrVkGpu::kLinearImage_Heap); + VkMemoryPropertyFlags mpf = phDevMemProps.memoryTypes[typeIndex].propertyFlags; + alloc->fFlags = mpf & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT ? 0x0 + : GrVkAlloc::kNoncoherent_Flag; + if (SkToBool(alloc->fFlags & GrVkAlloc::kNoncoherent_Flag)) { + SkASSERT(SkIsPow2(memReqs.alignment)); + SkASSERT(SkIsPow2(phDevProps.limits.nonCoherentAtomSize)); + memReqs.alignment = SkTMax(memReqs.alignment, phDevProps.limits.nonCoherentAtomSize); + } } else { - propFlags = AllocationPropertyFlags::kDedicatedAllocation; + // this memory type should always be available + SkASSERT_RELEASE(get_valid_memory_type_index(phDevMemProps, + memReqs.memoryTypeBits, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + &typeIndex, + &heapIndex)); + if (memReqs.size <= kMaxSmallImageSize) { + heap = gpu->getHeap(GrVkGpu::kSmallOptimalImage_Heap); + } else { + heap = gpu->getHeap(GrVkGpu::kOptimalImage_Heap); + } + alloc->fFlags = 0x0; } - if (!allocator->allocateMemoryForImage(image, AllocationPropertyFlags::kDedicatedAllocation, - &memory)) { - return false; + if (!heap->alloc(memReqs.size, memReqs.alignment, typeIndex, heapIndex, alloc)) { + // if optimal, try to allocate from non-host-visible non-device-local memory instead + if (linearTiling || + !get_valid_memory_type_index(phDevMemProps, memReqs.memoryTypeBits, + 0, &typeIndex, &heapIndex) || + !heap->alloc(memReqs.size, memReqs.alignment, typeIndex, heapIndex, alloc)) { + SkDebugf("Failed to alloc image\n"); + return false; + } } - allocator->getAllocInfo(memory, alloc); - // Bind buffer - VkResult err = GR_VK_CALL(gpu->vkInterface(), BindImageMemory(gpu->device(), image, - alloc->fMemory, alloc->fOffset)); + // Bind image + VkResult err = GR_VK_CALL(iface, BindImageMemory(device, image, + alloc->fMemory, alloc->fOffset)); if (err) { - FreeImageMemory(gpu, linearTiling, *alloc); + SkASSERT_RELEASE(heap->free(*alloc)); return false; } + gTotalImageMemory += alloc->fSize; + + VkDeviceSize pageAlignedSize = align_size(alloc->fSize, kMinVulkanPageSize); + gTotalImageMemoryFullPage += pageAlignedSize; + return true; } void GrVkMemory::FreeImageMemory(const GrVkGpu* gpu, bool linearTiling, const GrVkAlloc& alloc) { - if (alloc.fBackendMemory) { - GrVkMemoryAllocator* allocator = gpu->memoryAllocator(); - allocator->freeMemory(alloc.fBackendMemory); + GrVkHeap* heap; + if (linearTiling) { + heap = gpu->getHeap(GrVkGpu::kLinearImage_Heap); + } else if (alloc.fSize <= kMaxSmallImageSize) { + heap = gpu->getHeap(GrVkGpu::kSmallOptimalImage_Heap); } else { + heap = gpu->getHeap(GrVkGpu::kOptimalImage_Heap); + } + if (!heap->free(alloc)) { + // must be an adopted allocation GR_VK_CALL(gpu->vkInterface(), FreeMemory(gpu->device(), alloc.fMemory, nullptr)); + } else { + gTotalImageMemory -= alloc.fSize; + VkDeviceSize pageAlignedSize = align_size(alloc.fSize, kMinVulkanPageSize); + gTotalImageMemoryFullPage -= pageAlignedSize; } } -void* GrVkMemory::MapAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc) { - SkASSERT(GrVkAlloc::kMappable_Flag & alloc.fFlags); -#ifdef SK_DEBUG +void GrVkMemory::FlushMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc, VkDeviceSize offset, + VkDeviceSize size) { if (alloc.fFlags & GrVkAlloc::kNoncoherent_Flag) { +#ifdef SK_DEBUG + SkASSERT(offset >= alloc.fOffset); VkDeviceSize alignment = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize; 
- SkASSERT(0 == (alloc.fOffset & (alignment-1))); - SkASSERT(0 == (alloc.fSize & (alignment-1))); + SkASSERT(0 == (offset & (alignment-1))); + if (size != VK_WHOLE_SIZE) { + SkASSERT(size > 0); + SkASSERT(0 == (size & (alignment-1)) || + (offset + size) == (alloc.fOffset + alloc.fSize)); + SkASSERT(offset + size <= alloc.fOffset + alloc.fSize); + } +#endif + + VkMappedMemoryRange mappedMemoryRange; + memset(&mappedMemoryRange, 0, sizeof(VkMappedMemoryRange)); + mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mappedMemoryRange.memory = alloc.fMemory; + mappedMemoryRange.offset = offset; + mappedMemoryRange.size = size; + GR_VK_CALL(gpu->vkInterface(), FlushMappedMemoryRanges(gpu->device(), + 1, &mappedMemoryRange)); } +} + +void GrVkMemory::InvalidateMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc, + VkDeviceSize offset, VkDeviceSize size) { + if (alloc.fFlags & GrVkAlloc::kNoncoherent_Flag) { +#ifdef SK_DEBUG + SkASSERT(offset >= alloc.fOffset); + VkDeviceSize alignment = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize; + SkASSERT(0 == (offset & (alignment-1))); + if (size != VK_WHOLE_SIZE) { + SkASSERT(size > 0); + SkASSERT(0 == (size & (alignment-1)) || + (offset + size) == (alloc.fOffset + alloc.fSize)); + SkASSERT(offset + size <= alloc.fOffset + alloc.fSize); + } #endif - if (alloc.fBackendMemory) { - GrVkMemoryAllocator* allocator = gpu->memoryAllocator(); - return allocator->mapMemory(alloc.fBackendMemory); + + VkMappedMemoryRange mappedMemoryRange; + memset(&mappedMemoryRange, 0, sizeof(VkMappedMemoryRange)); + mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mappedMemoryRange.memory = alloc.fMemory; + mappedMemoryRange.offset = offset; + mappedMemoryRange.size = size; + GR_VK_CALL(gpu->vkInterface(), InvalidateMappedMemoryRanges(gpu->device(), + 1, &mappedMemoryRange)); } +} - void* mapPtr; - VkResult err = GR_VK_CALL(gpu->vkInterface(), MapMemory(gpu->device(), alloc.fMemory, - alloc.fOffset, - alloc.fSize, 0, &mapPtr)); - if (err) { - mapPtr = nullptr; +bool GrVkFreeListAlloc::alloc(VkDeviceSize requestedSize, + VkDeviceSize* allocOffset, VkDeviceSize* allocSize) { + VkDeviceSize alignedSize = align_size(requestedSize, fAlignment); + + // find the smallest block big enough for our allocation + FreeList::Iter iter = fFreeList.headIter(); + FreeList::Iter bestFitIter; + VkDeviceSize bestFitSize = fSize + 1; + VkDeviceSize secondLargestSize = 0; + VkDeviceSize secondLargestOffset = 0; + while (iter.get()) { + Block* block = iter.get(); + // need to adjust size to match desired alignment + SkASSERT(align_size(block->fOffset, fAlignment) - block->fOffset == 0); + if (block->fSize >= alignedSize && block->fSize < bestFitSize) { + bestFitIter = iter; + bestFitSize = block->fSize; + } + if (secondLargestSize < block->fSize && block->fOffset != fLargestBlockOffset) { + secondLargestSize = block->fSize; + secondLargestOffset = block->fOffset; + } + iter.next(); + } + SkASSERT(secondLargestSize <= fLargestBlockSize); + + Block* bestFit = bestFitIter.get(); + if (bestFit) { + SkASSERT(align_size(bestFit->fOffset, fAlignment) == bestFit->fOffset); + *allocOffset = bestFit->fOffset; + *allocSize = alignedSize; + // adjust or remove current block + VkDeviceSize originalBestFitOffset = bestFit->fOffset; + if (bestFit->fSize > alignedSize) { + bestFit->fOffset += alignedSize; + bestFit->fSize -= alignedSize; + if (fLargestBlockOffset == originalBestFitOffset) { + if (bestFit->fSize >= secondLargestSize) { + fLargestBlockSize = 
bestFit->fSize; + fLargestBlockOffset = bestFit->fOffset; + } else { + fLargestBlockSize = secondLargestSize; + fLargestBlockOffset = secondLargestOffset; + } + } +#ifdef SK_DEBUG + VkDeviceSize largestSize = 0; + iter = fFreeList.headIter(); + while (iter.get()) { + Block* block = iter.get(); + if (largestSize < block->fSize) { + largestSize = block->fSize; + } + iter.next(); + } + SkASSERT(largestSize == fLargestBlockSize); +#endif + } else { + SkASSERT(bestFit->fSize == alignedSize); + if (fLargestBlockOffset == originalBestFitOffset) { + fLargestBlockSize = secondLargestSize; + fLargestBlockOffset = secondLargestOffset; + } + fFreeList.remove(bestFit); +#ifdef SK_DEBUG + VkDeviceSize largestSize = 0; + iter = fFreeList.headIter(); + while (iter.get()) { + Block* block = iter.get(); + if (largestSize < block->fSize) { + largestSize = block->fSize; + } + iter.next(); + } + SkASSERT(largestSize == fLargestBlockSize); +#endif + } + fFreeSize -= alignedSize; + SkASSERT(*allocSize > 0); + + return true; } - return mapPtr; + + SkDebugf("Can't allocate %d bytes, %d bytes available, largest free block %d\n", alignedSize, fFreeSize, fLargestBlockSize); + + return false; } -void GrVkMemory::UnmapAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc) { - if (alloc.fBackendMemory) { - GrVkMemoryAllocator* allocator = gpu->memoryAllocator(); - allocator->unmapMemory(alloc.fBackendMemory); +void GrVkFreeListAlloc::free(VkDeviceSize allocOffset, VkDeviceSize allocSize) { + // find the block right after this allocation + FreeList::Iter iter = fFreeList.headIter(); + FreeList::Iter prev; + while (iter.get() && iter.get()->fOffset < allocOffset) { + prev = iter; + iter.next(); + } + // we have four cases: + // we exactly follow the previous one + Block* block; + if (prev.get() && prev.get()->fOffset + prev.get()->fSize == allocOffset) { + block = prev.get(); + block->fSize += allocSize; + if (block->fOffset == fLargestBlockOffset) { + fLargestBlockSize = block->fSize; + } + // and additionally we may exactly precede the next one + if (iter.get() && iter.get()->fOffset == allocOffset + allocSize) { + block->fSize += iter.get()->fSize; + if (iter.get()->fOffset == fLargestBlockOffset) { + fLargestBlockOffset = block->fOffset; + fLargestBlockSize = block->fSize; + } + fFreeList.remove(iter.get()); + } + // or we only exactly proceed the next one + } else if (iter.get() && iter.get()->fOffset == allocOffset + allocSize) { + block = iter.get(); + block->fSize += allocSize; + if (block->fOffset == fLargestBlockOffset) { + fLargestBlockOffset = allocOffset; + fLargestBlockSize = block->fSize; + } + block->fOffset = allocOffset; + // or we fall somewhere in between, with gaps } else { - GR_VK_CALL(gpu->vkInterface(), UnmapMemory(gpu->device(), alloc.fMemory)); + block = fFreeList.addBefore(iter); + block->fOffset = allocOffset; + block->fSize = allocSize; + } + fFreeSize += allocSize; + if (block->fSize > fLargestBlockSize) { + fLargestBlockSize = block->fSize; + fLargestBlockOffset = block->fOffset; + } + +#ifdef SK_DEBUG + VkDeviceSize largestSize = 0; + iter = fFreeList.headIter(); + while (iter.get()) { + Block* block = iter.get(); + if (largestSize < block->fSize) { + largestSize = block->fSize; + } + iter.next(); } + SkASSERT(fLargestBlockSize == largestSize); +#endif } -void GrVkMemory::GetNonCoherentMappedMemoryRange(const GrVkAlloc& alloc, VkDeviceSize offset, - VkDeviceSize size, VkDeviceSize alignment, - VkMappedMemoryRange* range) { - SkASSERT(alloc.fFlags & GrVkAlloc::kNoncoherent_Flag); - offset = 
offset + alloc.fOffset; - VkDeviceSize offsetDiff = offset & (alignment -1); - offset = offset - offsetDiff; - size = (size + alignment - 1) & ~(alignment - 1); +GrVkSubHeap::GrVkSubHeap(const GrVkGpu* gpu, uint32_t memoryTypeIndex, uint32_t heapIndex, + VkDeviceSize size, VkDeviceSize alignment) + : INHERITED(size, alignment) + , fGpu(gpu) #ifdef SK_DEBUG - SkASSERT(offset >= alloc.fOffset); - SkASSERT(offset + size <= alloc.fOffset + alloc.fSize); - SkASSERT(0 == (offset & (alignment-1))); - SkASSERT(size > 0); - SkASSERT(0 == (size & (alignment-1))); + , fHeapIndex(heapIndex) #endif + , fMemoryTypeIndex(memoryTypeIndex) { + + VkMemoryAllocateInfo allocInfo = { + VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, // sType + nullptr, // pNext + size, // allocationSize + memoryTypeIndex, // memoryTypeIndex + }; - memset(range, 0, sizeof(VkMappedMemoryRange)); - range->sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - range->memory = alloc.fMemory; - range->offset = offset; - range->size = size; + VkResult err = GR_VK_CALL(gpu->vkInterface(), AllocateMemory(gpu->device(), + &allocInfo, + nullptr, + &fAlloc)); + if (VK_SUCCESS != err) { + this->reset(); + } +#ifdef SK_DEBUG + else { + gHeapUsage[heapIndex] += size; + } +#endif } -void GrVkMemory::FlushMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc, VkDeviceSize offset, - VkDeviceSize size) { - if (alloc.fFlags & GrVkAlloc::kNoncoherent_Flag) { - SkASSERT(offset == 0); - SkASSERT(size <= alloc.fSize); - if (alloc.fBackendMemory) { - GrVkMemoryAllocator* allocator = gpu->memoryAllocator(); - allocator->flushMappedMemory(alloc.fBackendMemory, offset, size); - } else { - VkDeviceSize alignment = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize; - VkMappedMemoryRange mappedMemoryRange; - GrVkMemory::GetNonCoherentMappedMemoryRange(alloc, offset, size, alignment, - &mappedMemoryRange); - GR_VK_CALL(gpu->vkInterface(), FlushMappedMemoryRanges(gpu->device(), 1, - &mappedMemoryRange)); +GrVkSubHeap::~GrVkSubHeap() { + const GrVkInterface* iface = fGpu->vkInterface(); + GR_VK_CALL(iface, FreeMemory(fGpu->device(), fAlloc, nullptr)); +#ifdef SK_DEBUG + gHeapUsage[fHeapIndex] -= fSize; +#endif +} + +bool GrVkSubHeap::alloc(VkDeviceSize size, GrVkAlloc* alloc) { + alloc->fMemory = fAlloc; + return INHERITED::alloc(size, &alloc->fOffset, &alloc->fSize); +} + +void GrVkSubHeap::free(const GrVkAlloc& alloc) { + SkASSERT(alloc.fMemory == fAlloc); + + INHERITED::free(alloc.fOffset, alloc.fSize); +} + +bool GrVkHeap::subAlloc(VkDeviceSize size, VkDeviceSize alignment, + uint32_t memoryTypeIndex, uint32_t heapIndex, GrVkAlloc* alloc) { + VkDeviceSize alignedSize = align_size(size, alignment); + + // if requested is larger than our subheap allocation, just alloc directly + if (alignedSize > fSubHeapSize) { + VkMemoryAllocateInfo allocInfo = { + VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, // sType + nullptr, // pNext + alignedSize, // allocationSize + memoryTypeIndex, // memoryTypeIndex + }; + + VkResult err = GR_VK_CALL(fGpu->vkInterface(), AllocateMemory(fGpu->device(), + &allocInfo, + nullptr, + &alloc->fMemory)); + if (VK_SUCCESS != err) { + return false; } + alloc->fOffset = 0; + alloc->fSize = alignedSize; + alloc->fUsesSystemHeap = true; +#ifdef SK_DEBUG + gHeapUsage[VK_MAX_MEMORY_HEAPS] += alignedSize; +#endif + + return true; } + + // first try to find a subheap that fits our allocation request + int bestFitIndex = -1; + VkDeviceSize bestFitSize = 0x7FFFFFFF; + for (auto i = 0; i < fSubHeaps.count(); ++i) { + if (fSubHeaps[i]->memoryTypeIndex() == 
memoryTypeIndex && + fSubHeaps[i]->alignment() == alignment) { + VkDeviceSize heapSize = fSubHeaps[i]->largestBlockSize(); + if (heapSize >= alignedSize && heapSize < bestFitSize) { + bestFitIndex = i; + bestFitSize = heapSize; + } + } + } + + if (bestFitIndex >= 0) { + SkASSERT(fSubHeaps[bestFitIndex]->alignment() == alignment); + if (fSubHeaps[bestFitIndex]->alloc(size, alloc)) { + fUsedSize += alloc->fSize; + return true; + } + return false; + } + + // need to allocate a new subheap + std::unique_ptr<GrVkSubHeap>& subHeap = fSubHeaps.push_back(); + subHeap.reset(new GrVkSubHeap(fGpu, memoryTypeIndex, heapIndex, fSubHeapSize, alignment)); + // try to recover from failed allocation by only allocating what we need + if (subHeap->size() == 0) { + VkDeviceSize alignedSize = align_size(size, alignment); + subHeap.reset(new GrVkSubHeap(fGpu, memoryTypeIndex, heapIndex, alignedSize, alignment)); + if (subHeap->size() == 0) { + return false; + } + } + fAllocSize += fSubHeapSize; + if (subHeap->alloc(size, alloc)) { + fUsedSize += alloc->fSize; + return true; + } + + return false; } -void GrVkMemory::InvalidateMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc, - VkDeviceSize offset, VkDeviceSize size) { - if (alloc.fFlags & GrVkAlloc::kNoncoherent_Flag) { - SkASSERT(offset == 0); - SkASSERT(size <= alloc.fSize); - if (alloc.fBackendMemory) { - GrVkMemoryAllocator* allocator = gpu->memoryAllocator(); - allocator->invalidateMappedMemory(alloc.fBackendMemory, offset, size); - } else { - VkDeviceSize alignment = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize; - VkMappedMemoryRange mappedMemoryRange; - GrVkMemory::GetNonCoherentMappedMemoryRange(alloc, offset, size, alignment, - &mappedMemoryRange); - GR_VK_CALL(gpu->vkInterface(), InvalidateMappedMemoryRanges(gpu->device(), 1, - &mappedMemoryRange)); +bool GrVkHeap::singleAlloc(VkDeviceSize size, VkDeviceSize alignment, + uint32_t memoryTypeIndex, uint32_t heapIndex, GrVkAlloc* alloc) { + VkDeviceSize alignedSize = align_size(size, alignment); + + // first try to find an unallocated subheap that fits our allocation request + int bestFitIndex = -1; + VkDeviceSize bestFitSize = 0x7FFFFFFF; + for (auto i = 0; i < fSubHeaps.count(); ++i) { + if (fSubHeaps[i]->memoryTypeIndex() == memoryTypeIndex && + fSubHeaps[i]->alignment() == alignment && + fSubHeaps[i]->unallocated()) { + VkDeviceSize heapSize = fSubHeaps[i]->size(); + if (heapSize >= alignedSize && heapSize < bestFitSize) { + bestFitIndex = i; + bestFitSize = heapSize; + } + } + } + + if (bestFitIndex >= 0) { + SkASSERT(fSubHeaps[bestFitIndex]->alignment() == alignment); + if (fSubHeaps[bestFitIndex]->alloc(size, alloc)) { + fUsedSize += alloc->fSize; + return true; + } + return false; + } + + // need to allocate a new subheap + std::unique_ptr<GrVkSubHeap>& subHeap = fSubHeaps.push_back(); + subHeap.reset(new GrVkSubHeap(fGpu, memoryTypeIndex, heapIndex, alignedSize, alignment)); + fAllocSize += alignedSize; + if (subHeap->alloc(size, alloc)) { + fUsedSize += alloc->fSize; + return true; + } + + return false; +} + +bool GrVkHeap::free(const GrVkAlloc& alloc) { + // a size of 0 means we're using the system heap + if (alloc.fUsesSystemHeap) { + const GrVkInterface* iface = fGpu->vkInterface(); + GR_VK_CALL(iface, FreeMemory(fGpu->device(), alloc.fMemory, nullptr)); + return true; + } + + for (auto i = 0; i < fSubHeaps.count(); ++i) { + if (fSubHeaps[i]->memory() == alloc.fMemory) { + fSubHeaps[i]->free(alloc); + fUsedSize -= alloc.fSize; + return true; } } + + return false; } + diff 
--git a/src/gpu/vk/GrVkMemory.h b/src/gpu/vk/GrVkMemory.h index 741bdaa8a0..bb6681435f 100644 --- a/src/gpu/vk/GrVkMemory.h +++ b/src/gpu/vk/GrVkMemory.h @@ -34,25 +34,133 @@ namespace GrVkMemory { GrVkAlloc* alloc); void FreeImageMemory(const GrVkGpu* gpu, bool linearTiling, const GrVkAlloc& alloc); - // Maps the entire GrVkAlloc and returns a pointer to the start of the allocation. Underneath - // the hood, we may map more than the range of the GrVkAlloc (e.g. the entire VkDeviceMemory), - // but the pointer returned will always be to the start of the GrVkAlloc. The caller should also - // never assume more than the GrVkAlloc block has been mapped. - void* MapAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc); - void UnmapAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc); - - // For the Flush and Invalidate calls, the offset should be relative to the GrVkAlloc. Thus this - // will often be 0. The client does not need to make sure the offset and size are aligned to the - // nonCoherentAtomSize, the internal calls will handle that. void FlushMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc, VkDeviceSize offset, VkDeviceSize size); void InvalidateMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc, VkDeviceSize offset, VkDeviceSize size); - - // Helper for aligning and setting VkMappedMemoryRange for flushing/invalidating noncoherent - // memory. - void GetNonCoherentMappedMemoryRange(const GrVkAlloc&, VkDeviceSize offset, VkDeviceSize size, - VkDeviceSize alignment, VkMappedMemoryRange*); } +class GrVkFreeListAlloc { +public: + GrVkFreeListAlloc(VkDeviceSize size, VkDeviceSize alignment) + : fSize(size) + , fAlignment(alignment) + , fFreeSize(size) + , fLargestBlockSize(size) + , fLargestBlockOffset(0) { + Block* block = fFreeList.addToTail(); + block->fOffset = 0; + block->fSize = fSize; + } + ~GrVkFreeListAlloc() { + this->reset(); + } + + VkDeviceSize size() const { return fSize; } + VkDeviceSize alignment() const { return fAlignment; } + VkDeviceSize freeSize() const { return fFreeSize; } + VkDeviceSize largestBlockSize() const { return fLargestBlockSize; } + + bool unallocated() const { return fSize == fFreeSize; } + +protected: + bool alloc(VkDeviceSize requestedSize, VkDeviceSize* allocOffset, VkDeviceSize* allocSize); + void free(VkDeviceSize allocOffset, VkDeviceSize allocSize); + + void reset() { + fSize = 0; + fAlignment = 0; + fFreeSize = 0; + fLargestBlockSize = 0; + fFreeList.reset(); + } + + struct Block { + VkDeviceSize fOffset; + VkDeviceSize fSize; + }; + typedef SkTLList<Block, 16> FreeList; + + VkDeviceSize fSize; + VkDeviceSize fAlignment; + VkDeviceSize fFreeSize; + VkDeviceSize fLargestBlockSize; + VkDeviceSize fLargestBlockOffset; + FreeList fFreeList; +}; + +class GrVkSubHeap : public GrVkFreeListAlloc { +public: + GrVkSubHeap(const GrVkGpu* gpu, uint32_t memoryTypeIndex, uint32_t heapIndex, + VkDeviceSize size, VkDeviceSize alignment); + ~GrVkSubHeap(); + + uint32_t memoryTypeIndex() const { return fMemoryTypeIndex; } + VkDeviceMemory memory() { return fAlloc; } + + bool alloc(VkDeviceSize requestedSize, GrVkAlloc* alloc); + void free(const GrVkAlloc& alloc); + +private: + const GrVkGpu* fGpu; +#ifdef SK_DEBUG + uint32_t fHeapIndex; +#endif + uint32_t fMemoryTypeIndex; + VkDeviceMemory fAlloc; + + typedef GrVkFreeListAlloc INHERITED; +}; + +class GrVkHeap { +public: + enum Strategy { + kSubAlloc_Strategy, // alloc large subheaps and suballoc within them + kSingleAlloc_Strategy // alloc/recycle an individual subheap per object + }; + + GrVkHeap(const 
GrVkGpu* gpu, Strategy strategy, VkDeviceSize subHeapSize) + : fGpu(gpu) + , fSubHeapSize(subHeapSize) + , fAllocSize(0) + , fUsedSize(0) { + if (strategy == kSubAlloc_Strategy) { + fAllocFunc = &GrVkHeap::subAlloc; + } else { + fAllocFunc = &GrVkHeap::singleAlloc; + } + } + + ~GrVkHeap() {} + + VkDeviceSize allocSize() const { return fAllocSize; } + VkDeviceSize usedSize() const { return fUsedSize; } + + bool alloc(VkDeviceSize size, VkDeviceSize alignment, uint32_t memoryTypeIndex, + uint32_t heapIndex, GrVkAlloc* alloc) { + SkASSERT(size > 0); + alloc->fUsesSystemHeap = false; + return (*this.*fAllocFunc)(size, alignment, memoryTypeIndex, heapIndex, alloc); + } + bool free(const GrVkAlloc& alloc); + +private: + typedef bool (GrVkHeap::*AllocFunc)(VkDeviceSize size, VkDeviceSize alignment, + uint32_t memoryTypeIndex, uint32_t heapIndex, + GrVkAlloc* alloc); + + bool subAlloc(VkDeviceSize size, VkDeviceSize alignment, + uint32_t memoryTypeIndex, uint32_t heapIndex, + GrVkAlloc* alloc); + bool singleAlloc(VkDeviceSize size, VkDeviceSize alignment, + uint32_t memoryTypeIndex, uint32_t heapIndex, + GrVkAlloc* alloc); + + const GrVkGpu* fGpu; + VkDeviceSize fSubHeapSize; + VkDeviceSize fAllocSize; + VkDeviceSize fUsedSize; + AllocFunc fAllocFunc; + SkTArray<std::unique_ptr<GrVkSubHeap>> fSubHeaps; +}; #endif diff --git a/tests/VkHeapTests.cpp b/tests/VkHeapTests.cpp new file mode 100644 index 0000000000..67eb045d98 --- /dev/null +++ b/tests/VkHeapTests.cpp @@ -0,0 +1,239 @@ +/* + * Copyright 2015 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +// This is a GPU-backend specific test. It relies on static intializers to work + +#include "SkTypes.h" + +#if SK_SUPPORT_GPU && defined(SK_VULKAN) + +#include "GrContextPriv.h" +#include "GrContextFactory.h" +#include "GrTest.h" +#include "Test.h" +#include "vk/GrVkGpu.h" + +using sk_gpu_test::GrContextFactory; + +void subheap_test(skiatest::Reporter* reporter, GrContext* context) { + GrVkGpu* gpu = static_cast<GrVkGpu*>(context->contextPriv().getGpu()); + + // memtype doesn't matter, we're just testing the suballocation algorithm so we'll use 0 + GrVkSubHeap heap(gpu, 0, 0, 64 * 1024, 32); + GrVkAlloc alloc0, alloc1, alloc2, alloc3; + // test full allocation and free + REPORTER_ASSERT(reporter, heap.alloc(64 * 1024, &alloc0)); + REPORTER_ASSERT(reporter, alloc0.fOffset == 0); + REPORTER_ASSERT(reporter, alloc0.fSize == 64 * 1024); + REPORTER_ASSERT(reporter, heap.freeSize() == 0 && heap.largestBlockSize() == 0); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.freeSize() == 64*1024 && heap.largestBlockSize() == 64 * 1024); + + // now let's suballoc some memory + REPORTER_ASSERT(reporter, heap.alloc(16 * 1024, &alloc0)); + REPORTER_ASSERT(reporter, heap.alloc(23 * 1024, &alloc1)); + REPORTER_ASSERT(reporter, heap.alloc(18 * 1024, &alloc2)); + REPORTER_ASSERT(reporter, heap.freeSize() == 7 * 1024 && heap.largestBlockSize() == 7 * 1024); + // free lone block + heap.free(alloc1); + REPORTER_ASSERT(reporter, heap.freeSize() == 30 * 1024 && heap.largestBlockSize() == 23 * 1024); + // allocate into smallest free block + REPORTER_ASSERT(reporter, heap.alloc(6 * 1024, &alloc3)); + REPORTER_ASSERT(reporter, heap.freeSize() == 24 * 1024 && heap.largestBlockSize() == 23 * 1024); + // allocate into exact size free block + REPORTER_ASSERT(reporter, heap.alloc(23 * 1024, &alloc1)); + REPORTER_ASSERT(reporter, heap.freeSize() == 1 * 1024 && heap.largestBlockSize() == 1 * 1024); + 
// free lone block + heap.free(alloc2); + REPORTER_ASSERT(reporter, heap.freeSize() == 19 * 1024 && heap.largestBlockSize() == 18 * 1024); + // free and merge with preceding block and following + heap.free(alloc3); + REPORTER_ASSERT(reporter, heap.freeSize() == 25 * 1024 && heap.largestBlockSize() == 25 * 1024); + // free and merge with following block + heap.free(alloc1); + REPORTER_ASSERT(reporter, heap.freeSize() == 48 * 1024 && heap.largestBlockSize() == 48 * 1024); + // free starting block and merge with following + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.freeSize() == 64 * 1024 && heap.largestBlockSize() == 64 * 1024); + + // realloc + REPORTER_ASSERT(reporter, heap.alloc(4 * 1024, &alloc0)); + REPORTER_ASSERT(reporter, heap.alloc(35 * 1024, &alloc1)); + REPORTER_ASSERT(reporter, heap.alloc(10 * 1024, &alloc2)); + REPORTER_ASSERT(reporter, heap.freeSize() == 15 * 1024 && heap.largestBlockSize() == 15 * 1024); + // free starting block and merge with following + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.freeSize() == 19 * 1024 && heap.largestBlockSize() == 15 * 1024); + // free block and merge with preceding + heap.free(alloc1); + REPORTER_ASSERT(reporter, heap.freeSize() == 54 * 1024 && heap.largestBlockSize() == 39 * 1024); + // free block and merge with preceding and following + heap.free(alloc2); + REPORTER_ASSERT(reporter, heap.freeSize() == 64 * 1024 && heap.largestBlockSize() == 64 * 1024); + + // fragment + REPORTER_ASSERT(reporter, heap.alloc(19 * 1024, &alloc0)); + REPORTER_ASSERT(reporter, heap.alloc(5 * 1024, &alloc1)); + REPORTER_ASSERT(reporter, heap.alloc(15 * 1024, &alloc2)); + REPORTER_ASSERT(reporter, heap.alloc(3 * 1024, &alloc3)); + REPORTER_ASSERT(reporter, heap.freeSize() == 22 * 1024 && heap.largestBlockSize() == 22 * 1024); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.freeSize() == 41 * 1024 && heap.largestBlockSize() == 22 * 1024); + heap.free(alloc2); + REPORTER_ASSERT(reporter, heap.freeSize() == 56 * 1024 && heap.largestBlockSize() == 22 * 1024); + REPORTER_ASSERT(reporter, !heap.alloc(40 * 1024, &alloc0)); + heap.free(alloc3); + REPORTER_ASSERT(reporter, heap.freeSize() == 59 * 1024 && heap.largestBlockSize() == 40 * 1024); + REPORTER_ASSERT(reporter, heap.alloc(40 * 1024, &alloc0)); + REPORTER_ASSERT(reporter, heap.freeSize() == 19 * 1024 && heap.largestBlockSize() == 19 * 1024); + heap.free(alloc1); + REPORTER_ASSERT(reporter, heap.freeSize() == 24 * 1024 && heap.largestBlockSize() == 24 * 1024); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.freeSize() == 64 * 1024 && heap.largestBlockSize() == 64 * 1024); + + // unaligned sizes + REPORTER_ASSERT(reporter, heap.alloc(19 * 1024 - 31, &alloc0)); + REPORTER_ASSERT(reporter, heap.alloc(5 * 1024 - 5, &alloc1)); + REPORTER_ASSERT(reporter, heap.alloc(15 * 1024 - 19, &alloc2)); + REPORTER_ASSERT(reporter, heap.alloc(3 * 1024 - 3, &alloc3)); + REPORTER_ASSERT(reporter, heap.freeSize() == 22 * 1024 && heap.largestBlockSize() == 22 * 1024); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.freeSize() == 41 * 1024 && heap.largestBlockSize() == 22 * 1024); + heap.free(alloc2); + REPORTER_ASSERT(reporter, heap.freeSize() == 56 * 1024 && heap.largestBlockSize() == 22 * 1024); + REPORTER_ASSERT(reporter, !heap.alloc(40 * 1024, &alloc0)); + heap.free(alloc3); + REPORTER_ASSERT(reporter, heap.freeSize() == 59 * 1024 && heap.largestBlockSize() == 40 * 1024); + REPORTER_ASSERT(reporter, heap.alloc(40 * 1024, &alloc0)); + REPORTER_ASSERT(reporter, heap.freeSize() == 19 * 1024 && 
heap.largestBlockSize() == 19 * 1024); + heap.free(alloc1); + REPORTER_ASSERT(reporter, heap.freeSize() == 24 * 1024 && heap.largestBlockSize() == 24 * 1024); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.freeSize() == 64 * 1024 && heap.largestBlockSize() == 64 * 1024); +} + +void suballoc_test(skiatest::Reporter* reporter, GrContext* context) { + GrVkGpu* gpu = static_cast<GrVkGpu*>(context->contextPriv().getGpu()); + + // memtype/heap index don't matter, we're just testing the allocation algorithm so we'll use 0 + GrVkHeap heap(gpu, GrVkHeap::kSubAlloc_Strategy, 64 * 1024); + GrVkAlloc alloc0, alloc1, alloc2, alloc3; + const VkDeviceSize kAlignment = 16; + const uint32_t kMemType = 0; + const uint32_t kHeapIndex = 0; + + REPORTER_ASSERT(reporter, heap.allocSize() == 0 && heap.usedSize() == 0); + + // fragment allocations so we need to grow heap + REPORTER_ASSERT(reporter, heap.alloc(19 * 1024 - 3, kAlignment, kMemType, kHeapIndex, &alloc0)); + REPORTER_ASSERT(reporter, heap.alloc(5 * 1024 - 9, kAlignment, kMemType, kHeapIndex, &alloc1)); + REPORTER_ASSERT(reporter, heap.alloc(15 * 1024 - 15, kAlignment, kMemType, kHeapIndex, &alloc2)); + REPORTER_ASSERT(reporter, heap.alloc(3 * 1024 - 6, kAlignment, kMemType, kHeapIndex, &alloc3)); + REPORTER_ASSERT(reporter, heap.allocSize() == 64 * 1024 && heap.usedSize() == 42 * 1024); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.allocSize() == 64 * 1024 && heap.usedSize() == 23 * 1024); + heap.free(alloc2); + REPORTER_ASSERT(reporter, heap.allocSize() == 64 * 1024 && heap.usedSize() == 8 * 1024); + // we expect the heap to grow here + REPORTER_ASSERT(reporter, heap.alloc(40 * 1024, kAlignment, kMemType, kHeapIndex, &alloc0)); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 48 * 1024); + heap.free(alloc3); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 45 * 1024); + // heap should not grow here (first subheap has exactly enough room) + REPORTER_ASSERT(reporter, heap.alloc(40 * 1024, kAlignment, kMemType, kHeapIndex, &alloc3)); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 85 * 1024); + // heap should not grow here (second subheap has room) + REPORTER_ASSERT(reporter, heap.alloc(22 * 1024, kAlignment, kMemType, kHeapIndex, &alloc2)); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 107 * 1024); + heap.free(alloc1); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 102 * 1024); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 62 * 1024); + heap.free(alloc2); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 40 * 1024); + heap.free(alloc3); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 0 * 1024); + // heap should not grow here (allocating more than subheap size) + REPORTER_ASSERT(reporter, heap.alloc(128 * 1024, kAlignment, kMemType, kHeapIndex, &alloc0)); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 0 * 1024); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.alloc(24 * 1024, kAlignment, kMemType, kHeapIndex, &alloc0)); + REPORTER_ASSERT(reporter, heap.allocSize() == 128 * 1024 && heap.usedSize() == 24 * 1024); + // heap should alloc a new subheap because the memory type is different + REPORTER_ASSERT(reporter, heap.alloc(24 * 1024, kAlignment, kMemType+1, kHeapIndex, &alloc1)); + REPORTER_ASSERT(reporter, 
heap.allocSize() == 192 * 1024 && heap.usedSize() == 48 * 1024); + // heap should alloc a new subheap because the alignment is different + REPORTER_ASSERT(reporter, heap.alloc(24 * 1024, 128, kMemType, kHeapIndex, &alloc2)); + REPORTER_ASSERT(reporter, heap.allocSize() == 256 * 1024 && heap.usedSize() == 72 * 1024); + heap.free(alloc2); + heap.free(alloc0); + heap.free(alloc1); + REPORTER_ASSERT(reporter, heap.allocSize() == 256 * 1024 && heap.usedSize() == 0 * 1024); +} + +void singlealloc_test(skiatest::Reporter* reporter, GrContext* context) { + GrVkGpu* gpu = static_cast<GrVkGpu*>(context->contextPriv().getGpu()); + + // memtype/heap index don't matter, we're just testing the allocation algorithm so we'll use 0 + GrVkHeap heap(gpu, GrVkHeap::kSingleAlloc_Strategy, 64 * 1024); + GrVkAlloc alloc0, alloc1, alloc2, alloc3; + const VkDeviceSize kAlignment = 64; + const uint32_t kMemType = 0; + const uint32_t kHeapIndex = 0; + + REPORTER_ASSERT(reporter, heap.allocSize() == 0 && heap.usedSize() == 0); + + // make a few allocations + REPORTER_ASSERT(reporter, heap.alloc(49 * 1024 - 3, kAlignment, kMemType, kHeapIndex, &alloc0)); + REPORTER_ASSERT(reporter, heap.alloc(5 * 1024 - 37, kAlignment, kMemType, kHeapIndex, &alloc1)); + REPORTER_ASSERT(reporter, heap.alloc(15 * 1024 - 11, kAlignment, kMemType, kHeapIndex, &alloc2)); + REPORTER_ASSERT(reporter, heap.alloc(3 * 1024 - 29, kAlignment, kMemType, kHeapIndex, &alloc3)); + REPORTER_ASSERT(reporter, heap.allocSize() == 72 * 1024 && heap.usedSize() == 72 * 1024); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.allocSize() == 72 * 1024 && heap.usedSize() == 23 * 1024); + heap.free(alloc2); + REPORTER_ASSERT(reporter, heap.allocSize() == 72 * 1024 && heap.usedSize() == 8 * 1024); + // heap should not grow here (first subheap has room) + REPORTER_ASSERT(reporter, heap.alloc(40 * 1024, kAlignment, kMemType, kHeapIndex, &alloc0)); + REPORTER_ASSERT(reporter, heap.allocSize() == 72 * 1024 && heap.usedSize() == 48 * 1024); + heap.free(alloc3); + REPORTER_ASSERT(reporter, heap.allocSize() == 72 * 1024 && heap.usedSize() == 45 * 1024); + // check for exact fit -- heap should not grow here (third subheap has room) + REPORTER_ASSERT(reporter, heap.alloc(15 * 1024 - 63, kAlignment, kMemType, kHeapIndex, &alloc2)); + REPORTER_ASSERT(reporter, heap.allocSize() == 72 * 1024 && heap.usedSize() == 60 * 1024); + heap.free(alloc2); + REPORTER_ASSERT(reporter, heap.allocSize() == 72 * 1024 && heap.usedSize() == 45 * 1024); + // heap should grow here (no subheap has room) + REPORTER_ASSERT(reporter, heap.alloc(40 * 1024, kAlignment, kMemType, kHeapIndex, &alloc3)); + REPORTER_ASSERT(reporter, heap.allocSize() == 112 * 1024 && heap.usedSize() == 85 * 1024); + heap.free(alloc1); + REPORTER_ASSERT(reporter, heap.allocSize() == 112 * 1024 && heap.usedSize() == 80 * 1024); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.allocSize() == 112 * 1024 && heap.usedSize() == 40 * 1024); + heap.free(alloc3); + REPORTER_ASSERT(reporter, heap.allocSize() == 112 * 1024 && heap.usedSize() == 0 * 1024); + REPORTER_ASSERT(reporter, heap.alloc(24 * 1024, kAlignment, kMemType, kHeapIndex, &alloc0)); + REPORTER_ASSERT(reporter, heap.allocSize() == 112 * 1024 && heap.usedSize() == 24 * 1024); + // heap should alloc a new subheap because the memory type is different + REPORTER_ASSERT(reporter, heap.alloc(24 * 1024, kAlignment, kMemType + 1, kHeapIndex, &alloc1)); + REPORTER_ASSERT(reporter, heap.allocSize() == 136 * 1024 && heap.usedSize() == 48 * 1024); + // heap should alloc 
a new subheap because the alignment is different + REPORTER_ASSERT(reporter, heap.alloc(24 * 1024, 128, kMemType, kHeapIndex, &alloc2)); + REPORTER_ASSERT(reporter, heap.allocSize() == 160 * 1024 && heap.usedSize() == 72 * 1024); + heap.free(alloc1); + heap.free(alloc2); + heap.free(alloc0); + REPORTER_ASSERT(reporter, heap.allocSize() == 160 * 1024 && heap.usedSize() == 0 * 1024); +} + +DEF_GPUTEST_FOR_VULKAN_CONTEXT(VkHeapTests, reporter, ctxInfo) { + subheap_test(reporter, ctxInfo.grContext()); + suballoc_test(reporter, ctxInfo.grContext()); + singlealloc_test(reporter, ctxInfo.grContext()); +} + +#endif
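A note on the "unaligned sizes" block in subheap_test above: GrVkFreeListAlloc::alloc rounds every request up to the subheap's alignment (32 bytes there) before carving a block, so the four odd-sized requests occupy exactly the same aligned footprints as the earlier run and all the free-size assertions carry over. A quick self-contained check of that arithmetic:

    #include <cstdint>

    constexpr uint64_t align_size(uint64_t size, uint64_t alignment) {
        return (size + alignment - 1) & ~(alignment - 1);
    }

    // each odd request rounds up to the size used in the aligned run
    static_assert(align_size(19 * 1024 - 31, 32) == 19 * 1024, "");
    static_assert(align_size( 5 * 1024 -  5, 32) ==  5 * 1024, "");
    static_assert(align_size(15 * 1024 - 19, 32) == 15 * 1024, "");
    static_assert(align_size( 3 * 1024 -  3, 32) ==  3 * 1024, "");
    // hence 64K - (19K + 5K + 15K + 3K) == 22K free, as asserted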