diff options
Diffstat (limited to 'dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch')
-rw-r--r-- | dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch | 67 |
1 files changed, 67 insertions, 0 deletions
diff --git a/dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch b/dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch new file mode 100644 index 000000000000..ae2092f6e1ac --- /dev/null +++ b/dev-util/hip/files/hip-5.7.1-fix-unaligned-access.patch @@ -0,0 +1,67 @@ +Fix SIGSEGV when compiled with avx-512 instructions. + +Due to unaligned allocations, library crashes in +nontemporalMemcpy in _mm512_stream_si512 (which requires +64-aligned allocations, but used to copy default-aligned objects). + +Without this patch hipamd causes random crashes in hipMemcpy* callers +(tensile, rocBLAS, miopen, rocThrust, etc.). + +Bug: https://bugs.gentoo.org/915969 +Bug report in upstream: https://github.com/ROCm-Developer-Tools/clr/issues/18 +--- a/rocclr/device/rocm/rocvirtual.cpp ++++ b/rocclr/device/rocm/rocvirtual.cpp +@@ -2790,44 +2790,6 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) + return true; + } + +-// ================================================================================================ +-__attribute__((optimize("unroll-all-loops"), always_inline)) +-static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src, +- uint16_t size) { +- #if defined(__AVX512F__) +- for (auto i = 0u; i != size / sizeof(__m512i); ++i) { +- _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++, +- *reinterpret_cast<const __m512i* __restrict&>(src)++); +- } +- size = size % sizeof(__m512i); +- #endif +- +- #if defined(__AVX__) +- for (auto i = 0u; i != size / sizeof(__m256i); ++i) { +- _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++, +- *reinterpret_cast<const __m256i* __restrict&>(src)++); +- } +- size = size % sizeof(__m256i); +- #endif +- +- for (auto i = 0u; i != size / sizeof(__m128i); ++i) { +- _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++, +- *(reinterpret_cast<const __m128i* __restrict&>(src)++)); +- } +- size = size % sizeof(__m128i); +- +- for (auto i = 0u; i != size / sizeof(long long); ++i) { +- _mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++, +- *reinterpret_cast<const long long* __restrict&>(src)++); +- } +- size = size % sizeof(long long); +- +- for (auto i = 0u; i != size / sizeof(int); ++i) { +- _mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++, +- *reinterpret_cast<const int* __restrict&>(src)++); +- } +-} +- + // ================================================================================================ + bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, + const amd::Kernel& kernel, const_address parameters, void* eventHandle, +@@ -3096,7 +3058,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, + argBuffer = reinterpret_cast<address>(allocKernArg(gpuKernel.KernargSegmentByteSize(), + gpuKernel.KernargSegmentAlignment())); + // Load all kernel arguments +- nontemporalMemcpy(argBuffer, parameters, ++ memcpy(argBuffer, parameters, + std::min(gpuKernel.KernargSegmentByteSize(), + signature.paramsSize())); + } |