Diffstat (limited to 'dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch')
-rw-r--r--  dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch | 67
1 file changed, 67 insertions(+), 0 deletions(-)
diff --git a/dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch b/dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch
new file mode 100644
index 000000000000..ae2092f6e1ac
--- /dev/null
+++ b/dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch
@@ -0,0 +1,67 @@
+Fix a SIGSEGV when compiled with AVX-512 instructions.
+
+Due to unaligned allocations, the library crashes in
+nontemporalMemcpy at _mm512_stream_si512 (which requires
+64-byte-aligned addresses but is used to copy default-aligned objects).
+
+Without this patch, hipamd causes random crashes in hipMemcpy* callers
+(Tensile, rocBLAS, MIOpen, rocThrust, etc.).
+
+Bug: https://bugs.gentoo.org/915969
+Upstream bug report: https://github.com/ROCm-Developer-Tools/clr/issues/18
+--- a/rocclr/device/rocm/rocvirtual.cpp
++++ b/rocclr/device/rocm/rocvirtual.cpp
+@@ -2790,44 +2790,6 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
+ return true;
+ }
+
+-// ================================================================================================
+-__attribute__((optimize("unroll-all-loops"), always_inline))
+-static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src,
+- uint16_t size) {
+- #if defined(__AVX512F__)
+- for (auto i = 0u; i != size / sizeof(__m512i); ++i) {
+- _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++,
+- *reinterpret_cast<const __m512i* __restrict&>(src)++);
+- }
+- size = size % sizeof(__m512i);
+- #endif
+-
+- #if defined(__AVX__)
+- for (auto i = 0u; i != size / sizeof(__m256i); ++i) {
+- _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++,
+- *reinterpret_cast<const __m256i* __restrict&>(src)++);
+- }
+- size = size % sizeof(__m256i);
+- #endif
+-
+- for (auto i = 0u; i != size / sizeof(__m128i); ++i) {
+- _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++,
+- *(reinterpret_cast<const __m128i* __restrict&>(src)++));
+- }
+- size = size % sizeof(__m128i);
+-
+- for (auto i = 0u; i != size / sizeof(long long); ++i) {
+- _mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++,
+- *reinterpret_cast<const long long* __restrict&>(src)++);
+- }
+- size = size % sizeof(long long);
+-
+- for (auto i = 0u; i != size / sizeof(int); ++i) {
+- _mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++,
+- *reinterpret_cast<const int* __restrict&>(src)++);
+- }
+-}
+-
+ // ================================================================================================
+ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
+ const amd::Kernel& kernel, const_address parameters, void* eventHandle,
+@@ -3096,7 +3058,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
+ argBuffer = reinterpret_cast<address>(allocKernArg(gpuKernel.KernargSegmentByteSize(),
+ gpuKernel.KernargSegmentAlignment()));
+ // Load all kernel arguments
+- nontemporalMemcpy(argBuffer, parameters,
++ memcpy(argBuffer, parameters,
+ std::min(gpuKernel.KernargSegmentByteSize(),
+ signature.paramsSize()));
+ }
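For context, here is a minimal sketch (not part of the patch) of why the removed
nontemporalMemcpy faults: _mm512_stream_si512 is a 64-byte non-temporal store
and requires a 64-byte-aligned destination, so a copy helper must either
guarantee that alignment or check for it and fall back to plain memcpy. The
helper name safe_nontemporal_copy and its overall structure are illustrative
assumptions, not code from hipamd.

// Sketch only: an alignment-checked variant of a non-temporal copy.
// safe_nontemporal_copy is a hypothetical name, not a hipamd function.
#include <immintrin.h>
#include <cstddef>
#include <cstdint>
#include <cstring>

static void safe_nontemporal_copy(void* dst, const void* src, size_t size) {
#if defined(__AVX512F__)
  // _mm512_stream_si512 faults on a destination that is not 64-byte
  // aligned; this check is what the original nontemporalMemcpy lacked.
  if (reinterpret_cast<uintptr_t>(dst) % 64 == 0) {
    auto* d = static_cast<__m512i*>(dst);
    const auto* s = static_cast<const __m512i*>(src);
    size_t n = size / sizeof(__m512i);
    for (size_t i = 0; i != n; ++i) {
      // The unaligned load tolerates any src alignment; only the
      // streaming store imposes the 64-byte requirement.
      _mm512_stream_si512(d + i, _mm512_loadu_si512(s + i));
    }
    dst = d + n;
    src = s + n;
    size %= sizeof(__m512i);
    _mm_sfence();  // make the non-temporal stores globally visible
  }
#endif
  // Tail bytes, or an unaligned destination: plain memcpy is correct
  // for any alignment, which is exactly what the patch switches to.
  std::memcpy(dst, src, size);
}

The patch takes the simpler route and drops the streaming path entirely in
favor of memcpy; kernarg buffers are small, so losing the non-temporal hint is
unlikely to matter, while correctness no longer depends on the allocator's
alignment guarantees.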