Pateldisolution
diff --git a/‎src/gc/env/gcenv.os.h
Lines changed: 6 additions & 0 deletions b/‎src/gc/env/gcenv.os.h
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/gc/gc.cpp
Lines changed: 6 additions & 6 deletions b/‎src/gc/gc.cpp
Lines changed: 6 additions & 6 deletions
diff --git a/‎src/gc/handletablecache.cpp
Lines changed: 1 addition & 1 deletion b/‎src/gc/handletablecache.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/inc/clrhost.h
Lines changed: 1 addition & 0 deletions b/‎src/inc/clrhost.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/inc/yieldprocessornormalized.h
Lines changed: 222 additions & 0 deletions b/‎src/inc/yieldprocessornormalized.h
Lines changed: 222 additions & 0 deletions
diff --git a/‎src/utilcode/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎src/utilcode/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/utilcode/utsem.cpp
Lines changed: 4 additions & 38 deletions b/‎src/utilcode/utsem.cpp
Lines changed: 4 additions & 38 deletions
diff --git a/‎src/utilcode/yieldprocessornormalized.cpp
Lines changed: 10 additions & 0 deletions b/‎src/utilcode/yieldprocessornormalized.cpp
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/vm/CMakeLists.txt
Lines changed: 0 additions & 1 deletion b/‎src/vm/CMakeLists.txt
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/vm/common.h
Lines changed: 0 additions & 1 deletion b/‎src/vm/common.h
Lines changed: 0 additions & 1 deletion
@@ -18,6 +18,12 @@
 #undef Sleep
 #endif // Sleep
 
+#ifdef HAS_SYSTEM_YIELDPROCESSOR
+// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
+#undef YieldProcessor
+#define YieldProcessor System_YieldProcessor
+#endif
+
 #define NUMA_NODE_UNDEFINED UINT32_MAX
 
 // Critical section used by the GC
 
@@ -1633,7 +1633,7 @@ void WaitLongerNoInstru (int i)
     {
         if  (g_num_processors > 1)
         {
-            YieldProcessor();           // indicate to the processor that we are spining
+            YieldProcessor();           // indicate to the processor that we are spinning
             if  (i & 0x01f)
                 GCToOSInterface::YieldThread (0);
             else
@@ -1706,7 +1706,7 @@ static void enter_spin_lock_noinstru (RAW_KEYWORD(volatile) int32_t* lock)
                     {
                         if  (VolatileLoad(lock) < 0 || IsGCInProgress())
                             break;
-                        YieldProcessor();           // indicate to the processor that we are spining
+                        YieldProcessor();           // indicate to the processor that we are spinning
                     }
                     if  (VolatileLoad(lock) >= 0 && !IsGCInProgress())
                     {
@@ -1801,7 +1801,7 @@ void WaitLonger (int i
 #endif //SYNCHRONIZATION_STATS
         if  (g_num_processors > 1)
         {
-            YieldProcessor();           // indicate to the processor that we are spining
+            YieldProcessor();           // indicate to the processor that we are spinning
             if  (i & 0x01f)
                 GCToOSInterface::YieldThread (0);
             else
@@ -1852,7 +1852,7 @@ static void enter_spin_lock (GCSpinLock* spin_lock)
                     {
                         if  (spin_lock->lock < 0 || gc_heap::gc_started)
                             break;
-                        YieldProcessor();           // indicate to the processor that we are spining
+                        YieldProcessor();           // indicate to the processor that we are spinning
                     }
                     if  (spin_lock->lock >= 0 && !gc_heap::gc_started)
                     {
@@ -10332,7 +10332,7 @@ gc_heap::enter_gc_done_event_lock()
                 {
                     if  (gc_done_event_lock < 0)
                         break;
-                    YieldProcessor();           // indicate to the processor that we are spining
+                    YieldProcessor();           // indicate to the processor that we are spinning
                 }
                 if  (gc_done_event_lock >= 0)
                     GCToOSInterface::YieldThread(++dwSwitchCount);
@@ -36251,7 +36251,7 @@ void CFinalize::EnterFinalizeLock()
         unsigned int i = 0;
         while (lock >= 0)
         {
-            YieldProcessor();           // indicate to the processor that we are spining
+            YieldProcessor();           // indicate to the processor that we are spinning
             if (++i & 7)
                 GCToOSInterface::YieldThread (0);
             else
 
@@ -103,7 +103,7 @@ void SpinUntil(void *pCond, BOOL fNonZero)
         else
         {
             // nope - just spin again
-            YieldProcessor();           // indicate to the processor that we are spining 
+            YieldProcessor();           // indicate to the processor that we are spinning 
             uNonSleepSpins--;
         }
     }
 
@@ -22,6 +22,7 @@
 #include "predeftlsslot.h"
 #include "safemath.h"
 #include "debugreturn.h"
+#include "yieldprocessornormalized.h"
 
 #if !defined(_DEBUG_IMPL) && defined(_DEBUG) && !defined(DACCESS_COMPILE)
 #define _DEBUG_IMPL 1
 
@@ -0,0 +1,222 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#pragma once
+
+// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
+// the intention is to use the system-default implementation of YieldProcessor().
+#define HAS_SYSTEM_YIELDPROCESSOR
+FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
+#ifdef YieldProcessor
+#undef YieldProcessor
+#endif
+#define YieldProcessor Dont_Use_YieldProcessor
+
+const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
+const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+
+extern unsigned int g_yieldsPerNormalizedYield;
+extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
+
+void InitializeYieldProcessorNormalizedCrst();
+void EnsureYieldProcessorNormalizedInitialized();
+
+class YieldProcessorNormalizationInfo
+{
+private:
+    unsigned int yieldsPerNormalizedYield;
+    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
+    unsigned int optimalMaxYieldsPerSpinIteration;
+
+public:
+    YieldProcessorNormalizationInfo()
+        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
+        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
+        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+    }
+
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+};
+
+// See YieldProcessorNormalized() for preliminary info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
+{
+    unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
+// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
+//   - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
+//     for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessarily increase CPU usage
+//     and decrease scalability of the operation.
+//         while(!condition)
+//         {
+//             YieldProcessorNormalized();
+//         }
+//   - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
+//     condition, otherwise it may unnecessarily increase latency of the operation
+//   - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
+//     yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
+//     issue above on later iterations.
+//   - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
+//     issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
+//     System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
+FORCEINLINE void YieldProcessorNormalized()
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo());
+}
+
+// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
+//     if (!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo, 2);
+//         } while (!moreExpensiveCondition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
+{
+    _ASSERTE(count != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = (unsigned int)SIZE_MAX / MinNsPerNormalizedYield;
+        if (count > MaxCount)
+        {
+            count = MaxCount;
+        }
+    }
+
+    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
+// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
+//     while(!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalized(2);
+//     }
+FORCEINLINE void YieldProcessorNormalized(unsigned int count)
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count);
+}
+
+// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
+// info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int preSkylakeCount)
+{
+    _ASSERTE(preSkylakeCount != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = (unsigned int)SIZE_MAX / MinNsPerNormalizedYield;
+        if (preSkylakeCount > MaxCount)
+        {
+            preSkylakeCount = MaxCount;
+        }
+    }
+
+    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
+    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
+    if (n == 0)
+    {
+        n = 1;
+    }
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
+// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
+// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
+// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
+//     while(!condition)
+//     {
+//         YieldProcessorNormalizedForPreSkylakeCount(100);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
+{
+    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
+// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
+// iteration exponentially up to a limit. Typical usage:
+//     if (!conditionThatMayNotBeSatisfiedSoon)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration++); // spinIteration is the caller's counter, starting at 0; maybe Sleep(0) occasionally
+//         } while (!conditionThatMayNotBeSatisfiedSoon);
+//     }
+FORCEINLINE void YieldProcessorWithBackOffNormalized(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int spinIteration)
+{
+    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
+    // InitializeYieldProcessorNormalized()
+    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
+        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
+    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    // This shift value should be adjusted based on the asserted condition below
+    const UINT8 MaxShift = 3;
+    static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    unsigned int n;
+    if (spinIteration <= MaxShift &&
+        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
+    }
+    else
+    {
+        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
+    }
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
@@ -55,6 +55,7 @@ set(UTILCODE_COMMON_SOURCES
   pedecoder.cpp
   winfix.cpp
   longfilepathwrappers.cpp
+  yieldprocessornormalized.cpp
 )
 
 # These source file do not yet compile on Linux.
 
@@ -232,25 +232,8 @@ HRESULT UTSemReadWrite::LockRead()
             }
 
             // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            int sum = 0;
-            
-            for (int delayCount = i; --delayCount; ) 
-            {
-                sum += delayCount;
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
-            
-            if (sum == 0)
-            {
-                // never executed, just to fool the compiler into thinking sum is live here,
-                // so that it won't optimize away the loop.
-                static char dummy;
-                dummy++;
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
+
             // exponential backoff: wait a factor longer in the next iteration
             i *= g_SpinConstants.dwBackoffFactor;
         } while (i < g_SpinConstants.dwMaximumDuration);
@@ -341,25 +324,8 @@ HRESULT UTSemReadWrite::LockWrite()
             }
 
             // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            int sum = 0;
-            
-            for (int delayCount = i; --delayCount; ) 
-            {
-                sum += delayCount;
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
-            
-            if (sum == 0)
-            {
-                // never executed, just to fool the compiler into thinking sum is live here,
-                // so that it won't optimize away the loop.
-                static char dummy;
-                dummy++;
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
+
             // exponential backoff: wait a factor longer in the next iteration
             i *= g_SpinConstants.dwBackoffFactor;
         } while (i < g_SpinConstants.dwMaximumDuration);
 
@@ -0,0 +1,10 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#include "stdafx.h"
+
+// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
+// tuned for Skylake processors
+unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
+unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
@@ -232,7 +232,6 @@ set(VM_HEADERS_DAC_AND_WKS_COMMON
     versionresilienthashcode.h
     virtualcallstub.h
     win32threadpool.h
-    yieldprocessornormalized.h
     zapsig.h
 )
 
 
@@ -309,7 +309,6 @@ namespace Loader
 #include "pedecoder.h"
 #include "sstring.h"
 #include "slist.h"
-#include "yieldprocessornormalized.h"
 
 #include "eeconfig.h"
Original file line number	Diff line number	Diff line change
`@@ -1633,7 +1633,7 @@ void WaitLongerNoInstru (int i)`
`1633`	`1633`	`{`
`1634`	`1634`	`if (g_num_processors > 1)`
`1635`	`1635`	`{`
`1636`		`- YieldProcessor(); // indicate to the processor that we are spining`
	`1636`	`+ YieldProcessor(); // indicate to the processor that we are spinning`
`1637`	`1637`	`if (i & 0x01f)`
`1638`	`1638`	`GCToOSInterface::YieldThread (0);`
`1639`	`1639`	`else`
`@@ -1706,7 +1706,7 @@ static void enter_spin_lock_noinstru (RAW_KEYWORD(volatile) int32_t* lock)`
`1706`	`1706`	`{`
`1707`	`1707`	`if (VolatileLoad(lock) < 0 \|\| IsGCInProgress())`
`1708`	`1708`	`break;`
`1709`		`- YieldProcessor(); // indicate to the processor that we are spining`
	`1709`	`+ YieldProcessor(); // indicate to the processor that we are spinning`
`1710`	`1710`	`}`
`1711`	`1711`	`if (VolatileLoad(lock) >= 0 && !IsGCInProgress())`
`1712`	`1712`	`{`
`@@ -1801,7 +1801,7 @@ void WaitLonger (int i`
`1801`	`1801`	`#endif //SYNCHRONIZATION_STATS`
`1802`	`1802`	`if (g_num_processors > 1)`
`1803`	`1803`	`{`
`1804`		`- YieldProcessor(); // indicate to the processor that we are spining`
	`1804`	`+ YieldProcessor(); // indicate to the processor that we are spinning`
`1805`	`1805`	`if (i & 0x01f)`
`1806`	`1806`	`GCToOSInterface::YieldThread (0);`
`1807`	`1807`	`else`
`@@ -1852,7 +1852,7 @@ static void enter_spin_lock (GCSpinLock* spin_lock)`
`1852`	`1852`	`{`
`1853`	`1853`	`if (spin_lock->lock < 0 \|\| gc_heap::gc_started)`
`1854`	`1854`	`break;`
`1855`		`- YieldProcessor(); // indicate to the processor that we are spining`
	`1855`	`+ YieldProcessor(); // indicate to the processor that we are spinning`
`1856`	`1856`	`}`
`1857`	`1857`	`if (spin_lock->lock >= 0 && !gc_heap::gc_started)`
`1858`	`1858`	`{`
`@@ -10332,7 +10332,7 @@ gc_heap::enter_gc_done_event_lock()`
`10332`	`10332`	`{`
`10333`	`10333`	`if (gc_done_event_lock < 0)`
`10334`	`10334`	`break;`
`10335`		`- YieldProcessor(); // indicate to the processor that we are spining`
	`10335`	`+ YieldProcessor(); // indicate to the processor that we are spinning`
`10336`	`10336`	`}`
`10337`	`10337`	`if (gc_done_event_lock >= 0)`
`10338`	`10338`	`GCToOSInterface::YieldThread(++dwSwitchCount);`
`@@ -36251,7 +36251,7 @@ void CFinalize::EnterFinalizeLock()`
`36251`	`36251`	`unsigned int i = 0;`
`36252`	`36252`	`while (lock >= 0)`
`36253`	`36253`	`{`
`36254`		`- YieldProcessor(); // indicate to the processor that we are spining`
	`36254`	`+ YieldProcessor(); // indicate to the processor that we are spinning`
`36255`	`36255`	`if (++i & 7)`
`36256`	`36256`	`GCToOSInterface::YieldThread (0);`
`36257`	`36257`	`else`
Original file line number	Diff line number	Diff line change
`@@ -103,7 +103,7 @@ void SpinUntil(void *pCond, BOOL fNonZero)`
`103`	`103`	`else`
`104`	`104`	`{`
`105`	`105`	`// nope - just spin again`
`106`		`- YieldProcessor(); // indicate to the processor that we are spining`
	`106`	`+ YieldProcessor(); // indicate to the processor that we are spinning`
`107`	`107`	`uNonSleepSpins--;`
`108`	`108`	`}`
`109`	`109`	`}`
Original file line number	Diff line number	Diff line change
`@@ -55,6 +55,7 @@ set(UTILCODE_COMMON_SOURCES`
`55`	`55`	`pedecoder.cpp`
`56`	`56`	`winfix.cpp`
`57`	`57`	`longfilepathwrappers.cpp`
	`58`	`+ yieldprocessornormalized.cpp`
`58`	`59`	`)`
`59`	`60`
`60`	`61`	`# These source file do not yet compile on Linux.`
Original file line number	Diff line number	Diff line change
`@@ -232,7 +232,6 @@ set(VM_HEADERS_DAC_AND_WKS_COMMON`
`232`	`232`	`versionresilienthashcode.h`
`233`	`233`	`virtualcallstub.h`
`234`	`234`	`win32threadpool.h`
`235`		`- yieldprocessornormalized.h`
`236`	`235`	`zapsig.h`
`237`	`236`	`)`
`238`	`237`