1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
Fix SIGSEGV when compiled with avx-512 instructions.
Due to unaligned allocations, library crashes in
nontemporalMemcpy in _mm512_stream_si512 (which requires
64-aligned allocations, but used to copy default-aligned objects).
Without this patch hipamd causes random crashes in hipMemcpy* callers
(tensile, rocBLAS, miopen, rocThrust, etc.).
Bug: https://bugs.gentoo.org/915969
Bug report in upstream: https://github.com/ROCm-Developer-Tools/clr/issues/18
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -2790,44 +2790,6 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
return true;
}
-// ================================================================================================
-__attribute__((optimize("unroll-all-loops"), always_inline))
-static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src,
- uint16_t size) {
- #if defined(__AVX512F__)
- for (auto i = 0u; i != size / sizeof(__m512i); ++i) {
- _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++,
- *reinterpret_cast<const __m512i* __restrict&>(src)++);
- }
- size = size % sizeof(__m512i);
- #endif
-
- #if defined(__AVX__)
- for (auto i = 0u; i != size / sizeof(__m256i); ++i) {
- _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++,
- *reinterpret_cast<const __m256i* __restrict&>(src)++);
- }
- size = size % sizeof(__m256i);
- #endif
-
- for (auto i = 0u; i != size / sizeof(__m128i); ++i) {
- _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++,
- *(reinterpret_cast<const __m128i* __restrict&>(src)++));
- }
- size = size % sizeof(__m128i);
-
- for (auto i = 0u; i != size / sizeof(long long); ++i) {
- _mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++,
- *reinterpret_cast<const long long* __restrict&>(src)++);
- }
- size = size % sizeof(long long);
-
- for (auto i = 0u; i != size / sizeof(int); ++i) {
- _mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++,
- *reinterpret_cast<const int* __restrict&>(src)++);
- }
-}
-
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
const amd::Kernel& kernel, const_address parameters, void* eventHandle,
@@ -3096,7 +3058,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
argBuffer = reinterpret_cast<address>(allocKernArg(gpuKernel.KernargSegmentByteSize(),
gpuKernel.KernargSegmentAlignment()));
// Load all kernel arguments
- nontemporalMemcpy(argBuffer, parameters,
+ memcpy(argBuffer, parameters,
std::min(gpuKernel.KernargSegmentByteSize(),
signature.paramsSize()));
}
|