Skip to content

Commit 616fea5

Browse files
authored
Normalize a few more spin-wait loops (dotnet#21586)
Normalize a few more spin-wait loops
- Fixed a few more spin-waits to normalize the spin-wait duration between processors.
- These spin-waits have so far not needed to be retuned to avoid unreasonably long spin-wait durations. They can be retuned as necessary in the future.
- Added a version of YieldProcessorNormalized() that normalizes based on spin-wait counts tuned for pre-Skylake processors, for spin-wait loops that have not been retuned.
- Moved some files around to make YieldProcessorNormalized() and the like available in more places. Initialization is still only done in the VM. Uses outside the VM will use the defaults, where there would be no significant change from before.
- Made YieldProcessor() private outside of the GC and added System_YieldProcessor() for when the system-defined implementation is intended to be used.
1 parent 459b58a commit 616fea5

21 files changed

+275
-194
lines changed

src/gc/env/gcenv.os.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818
#undef Sleep
1919
#endif // Sleep
2020

21+
#ifdef HAS_SYSTEM_YIELDPROCESSOR
22+
// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
23+
#undef YieldProcessor
24+
#define YieldProcessor System_YieldProcessor
25+
#endif
26+
2127
#define NUMA_NODE_UNDEFINED UINT32_MAX
2228

2329
// Critical section used by the GC

src/gc/gc.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1633,7 +1633,7 @@ void WaitLongerNoInstru (int i)
16331633
{
16341634
if (g_num_processors > 1)
16351635
{
1636-
YieldProcessor(); // indicate to the processor that we are spining
1636+
YieldProcessor(); // indicate to the processor that we are spinning
16371637
if (i & 0x01f)
16381638
GCToOSInterface::YieldThread (0);
16391639
else
@@ -1706,7 +1706,7 @@ static void enter_spin_lock_noinstru (RAW_KEYWORD(volatile) int32_t* lock)
17061706
{
17071707
if (VolatileLoad(lock) < 0 || IsGCInProgress())
17081708
break;
1709-
YieldProcessor(); // indicate to the processor that we are spining
1709+
YieldProcessor(); // indicate to the processor that we are spinning
17101710
}
17111711
if (VolatileLoad(lock) >= 0 && !IsGCInProgress())
17121712
{
@@ -1801,7 +1801,7 @@ void WaitLonger (int i
18011801
#endif //SYNCHRONIZATION_STATS
18021802
if (g_num_processors > 1)
18031803
{
1804-
YieldProcessor(); // indicate to the processor that we are spining
1804+
YieldProcessor(); // indicate to the processor that we are spinning
18051805
if (i & 0x01f)
18061806
GCToOSInterface::YieldThread (0);
18071807
else
@@ -1852,7 +1852,7 @@ static void enter_spin_lock (GCSpinLock* spin_lock)
18521852
{
18531853
if (spin_lock->lock < 0 || gc_heap::gc_started)
18541854
break;
1855-
YieldProcessor(); // indicate to the processor that we are spining
1855+
YieldProcessor(); // indicate to the processor that we are spinning
18561856
}
18571857
if (spin_lock->lock >= 0 && !gc_heap::gc_started)
18581858
{
@@ -10332,7 +10332,7 @@ gc_heap::enter_gc_done_event_lock()
1033210332
{
1033310333
if (gc_done_event_lock < 0)
1033410334
break;
10335-
YieldProcessor(); // indicate to the processor that we are spining
10335+
YieldProcessor(); // indicate to the processor that we are spinning
1033610336
}
1033710337
if (gc_done_event_lock >= 0)
1033810338
GCToOSInterface::YieldThread(++dwSwitchCount);
@@ -36251,7 +36251,7 @@ void CFinalize::EnterFinalizeLock()
3625136251
unsigned int i = 0;
3625236252
while (lock >= 0)
3625336253
{
36254-
YieldProcessor(); // indicate to the processor that we are spining
36254+
YieldProcessor(); // indicate to the processor that we are spinning
3625536255
if (++i & 7)
3625636256
GCToOSInterface::YieldThread (0);
3625736257
else

src/gc/handletablecache.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ void SpinUntil(void *pCond, BOOL fNonZero)
103103
else
104104
{
105105
// nope - just spin again
106-
YieldProcessor(); // indicate to the processor that we are spining
106+
YieldProcessor(); // indicate to the processor that we are spinning
107107
uNonSleepSpins--;
108108
}
109109
}

src/inc/clrhost.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "predeftlsslot.h"
2323
#include "safemath.h"
2424
#include "debugreturn.h"
25+
#include "yieldprocessornormalized.h"
2526

2627
#if !defined(_DEBUG_IMPL) && defined(_DEBUG) && !defined(DACCESS_COMPILE)
2728
#define _DEBUG_IMPL 1

src/inc/yieldprocessornormalized.h

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
#pragma once
6+
7+
// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
8+
// the intention is to use the system-default implementation of YieldProcessor().
9+
#define HAS_SYSTEM_YIELDPROCESSOR
10+
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
11+
#ifdef YieldProcessor
12+
#undef YieldProcessor
13+
#endif
14+
#define YieldProcessor Dont_Use_YieldProcessor
15+
16+
const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
17+
const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
18+
19+
extern unsigned int g_yieldsPerNormalizedYield;
20+
extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
21+
22+
void InitializeYieldProcessorNormalizedCrst();
23+
void EnsureYieldProcessorNormalizedInitialized();
24+
25+
class YieldProcessorNormalizationInfo
26+
{
27+
private:
28+
unsigned int yieldsPerNormalizedYield;
29+
unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
30+
unsigned int optimalMaxYieldsPerSpinIteration;
31+
32+
public:
33+
YieldProcessorNormalizationInfo()
34+
: yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
35+
optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
36+
optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
37+
{
38+
}
39+
40+
friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
41+
friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
42+
friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
43+
friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
44+
};
45+
46+
// See YieldProcessorNormalized() for preliminary info. Typical usage:
47+
// if (!condition)
48+
// {
49+
// YieldProcessorNormalizationInfo normalizationInfo;
50+
// do
51+
// {
52+
// YieldProcessorNormalized(normalizationInfo);
53+
// } while (!condition);
54+
// }
55+
FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
56+
{
57+
unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
58+
_ASSERTE(n != 0);
59+
do
60+
{
61+
System_YieldProcessor();
62+
} while (--n != 0);
63+
}
64+
65+
// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
66+
// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
67+
// - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
68+
// for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessarily increase CPU usage
69+
// and decrease scalability of the operation.
70+
// while(!condition)
71+
// {
72+
// YieldProcessorNormalized();
73+
// }
74+
// - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
75+
// condition, otherwise it may unnecessarily increase latency of the operation.
76+
// - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
77+
// yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
78+
// issue above on later iterations.
79+
// - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
80+
// issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
81+
// System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
82+
FORCEINLINE void YieldProcessorNormalized()
83+
{
84+
YieldProcessorNormalized(YieldProcessorNormalizationInfo());
85+
}
86+
87+
// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
88+
// if (!moreExpensiveCondition)
89+
// {
90+
// YieldProcessorNormalizationInfo normalizationInfo;
91+
// do
92+
// {
93+
// YieldProcessorNormalized(normalizationInfo, 2);
94+
// } while (!moreExpensiveCondition);
95+
// }
96+
FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
97+
{
98+
_ASSERTE(count != 0);
99+
100+
if (sizeof(SIZE_T) <= sizeof(unsigned int))
101+
{
102+
// On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
103+
// is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
104+
const unsigned int MaxCount = (unsigned int)SIZE_MAX / MinNsPerNormalizedYield;
105+
if (count > MaxCount)
106+
{
107+
count = MaxCount;
108+
}
109+
}
110+
111+
SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
112+
_ASSERTE(n != 0);
113+
do
114+
{
115+
System_YieldProcessor();
116+
} while (--n != 0);
117+
}
118+
119+
// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
120+
// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
121+
// while(!moreExpensiveCondition)
122+
// {
123+
// YieldProcessorNormalized(2);
124+
// }
125+
FORCEINLINE void YieldProcessorNormalized(unsigned int count)
126+
{
127+
YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count);
128+
}
129+
130+
// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
131+
// info. Typical usage:
132+
// if (!condition)
133+
// {
134+
// YieldProcessorNormalizationInfo normalizationInfo;
135+
// do
136+
// {
137+
// YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
138+
// } while (!condition);
139+
// }
140+
FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
141+
const YieldProcessorNormalizationInfo &normalizationInfo,
142+
unsigned int preSkylakeCount)
143+
{
144+
_ASSERTE(preSkylakeCount != 0);
145+
146+
if (sizeof(SIZE_T) <= sizeof(unsigned int))
147+
{
148+
// On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
149+
// is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
150+
const unsigned int MaxCount = (unsigned int)SIZE_MAX / MinNsPerNormalizedYield;
151+
if (preSkylakeCount > MaxCount)
152+
{
153+
preSkylakeCount = MaxCount;
154+
}
155+
}
156+
157+
const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
158+
SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
159+
if (n == 0)
160+
{
161+
n = 1;
162+
}
163+
do
164+
{
165+
System_YieldProcessor();
166+
} while (--n != 0);
167+
}
168+
169+
// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
170+
// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
171+
// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
172+
// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
173+
// while(!condition)
174+
// {
175+
// YieldProcessorNormalizedForPreSkylakeCount(100);
176+
// }
177+
FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
178+
{
179+
YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
180+
}
181+
182+
// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
183+
// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
184+
// iteration exponentially up to a limit. Typical usage:
185+
// if (!conditionThatMayNotBeSatisfiedSoon)
186+
// {
187+
// YieldProcessorNormalizationInfo normalizationInfo;
188+
// do
189+
// {
190+
// YieldProcessorWithBackOffNormalized(normalizationInfo); // maybe Sleep(0) occasionally
191+
// } while (!conditionThatMayNotBeSatisfiedSoon);
192+
// }
193+
FORCEINLINE void YieldProcessorWithBackOffNormalized(
194+
const YieldProcessorNormalizationInfo &normalizationInfo,
195+
unsigned int spinIteration)
196+
{
197+
// normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
198+
// InitializeYieldProcessorNormalized()
199+
const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
200+
NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
201+
_ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
202+
203+
// This shift value should be adjusted based on the asserted condition below
204+
const UINT8 MaxShift = 3;
205+
static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
206+
207+
unsigned int n;
208+
if (spinIteration <= MaxShift &&
209+
((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
210+
{
211+
n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
212+
}
213+
else
214+
{
215+
n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
216+
}
217+
_ASSERTE(n != 0);
218+
do
219+
{
220+
System_YieldProcessor();
221+
} while (--n != 0);
222+
}

src/utilcode/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ set(UTILCODE_COMMON_SOURCES
5555
pedecoder.cpp
5656
winfix.cpp
5757
longfilepathwrappers.cpp
58+
yieldprocessornormalized.cpp
5859
)
5960

6061
# These source files do not yet compile on Linux.

src/utilcode/utsem.cpp

Lines changed: 4 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -232,25 +232,8 @@ HRESULT UTSemReadWrite::LockRead()
232232
}
233233

234234
// Delay by approximately 2*i clock cycles (Pentium III).
235-
// This is brittle code - future processors may of course execute this
236-
// faster or slower, and future code generators may eliminate the loop altogether.
237-
// The precise value of the delay is not critical, however, and I can't think
238-
// of a better way that isn't machine-dependent.
239-
int sum = 0;
240-
241-
for (int delayCount = i; --delayCount; )
242-
{
243-
sum += delayCount;
244-
YieldProcessor(); // indicate to the processor that we are spining
245-
}
246-
247-
if (sum == 0)
248-
{
249-
// never executed, just to fool the compiler into thinking sum is live here,
250-
// so that it won't optimize away the loop.
251-
static char dummy;
252-
dummy++;
253-
}
235+
YieldProcessorNormalizedForPreSkylakeCount(i);
236+
254237
// exponential backoff: wait a factor longer in the next iteration
255238
i *= g_SpinConstants.dwBackoffFactor;
256239
} while (i < g_SpinConstants.dwMaximumDuration);
@@ -341,25 +324,8 @@ HRESULT UTSemReadWrite::LockWrite()
341324
}
342325

343326
// Delay by approximately 2*i clock cycles (Pentium III).
344-
// This is brittle code - future processors may of course execute this
345-
// faster or slower, and future code generators may eliminate the loop altogether.
346-
// The precise value of the delay is not critical, however, and I can't think
347-
// of a better way that isn't machine-dependent.
348-
int sum = 0;
349-
350-
for (int delayCount = i; --delayCount; )
351-
{
352-
sum += delayCount;
353-
YieldProcessor(); // indicate to the processor that we are spining
354-
}
355-
356-
if (sum == 0)
357-
{
358-
// never executed, just to fool the compiler into thinking sum is live here,
359-
// so that it won't optimize away the loop.
360-
static char dummy;
361-
dummy++;
362-
}
327+
YieldProcessorNormalizedForPreSkylakeCount(i);
328+
363329
// exponential backoff: wait a factor longer in the next iteration
364330
i *= g_SpinConstants.dwBackoffFactor;
365331
} while (i < g_SpinConstants.dwMaximumDuration);
src/utilcode/yieldprocessornormalized.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
#include "stdafx.h"
6+
7+
// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
8+
// tuned for Skylake processors
9+
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
10+
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

src/vm/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,6 @@ set(VM_HEADERS_DAC_AND_WKS_COMMON
232232
versionresilienthashcode.h
233233
virtualcallstub.h
234234
win32threadpool.h
235-
yieldprocessornormalized.h
236235
zapsig.h
237236
)
238237

src/vm/common.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,6 @@ namespace Loader
309309
#include "pedecoder.h"
310310
#include "sstring.h"
311311
#include "slist.h"
312-
#include "yieldprocessornormalized.h"
313312

314313
#include "eeconfig.h"
315314

0 commit comments

Comments
 (0)