處理這個死結問題,花了好幾天,相信遇到的同學,一樣頭疼,但有個好輔助類的話(好在.net的API足夠強大),就沒這麼頭疼了 注意
本篇文章的解決方案只適合使用lock(obj),或是:Monitor.Enter(obj); …. Monitor.Exit(obj)的方式
類似醬紫的死結
如果你使用的是:AutoResetEvent.Set/Reset、Monitor.Wait/Pulse 或 Mutex 的方式,請另尋他法。
輔助類
//------------------------------------------------------------------------
// Created by Jave.Lin 4/21/2018 5:31:57 PM
//------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Threading;

// The namespace is deliberately left commented out so that CDL can be used
// without adding a using directive at every call site.
//namespace Common.ComUtil
//{

/// <summary>
/// Locker Information: the object that is actually locked (via <see cref="Monitor"/>)
/// plus the diagnostic data recorded about its current holder.
/// author  : Jave.Lin
/// date    : 4/21/2018 5:31:57 PM
/// </summary>
public class Locker
{
    public object tag;                // Arbitrary context data attached by the caller.
    public int threadID;              // Managed thread ID of the thread that acquired the lock.
    public string name;               // Name of the lock.
    public int lockedTimes;           // Total number of acquisitions; helps judge how likely a deadlock is.
    public int lockingTs;             // Environment.TickCount at the moment the lock was acquired.
    public bool enter;                // Set to true when the lock is acquired.
    public bool exit;                 // Set to true when the lock is released.
    public string lockingStackTrace;  // Call stack captured at the moment the lock was acquired.
    public int lockingDLTs;           // Environment.TickCount at the moment a waiter timed out (suspected deadlock).
    public List<Locker> pRecordList;  // Snapshots of this locker taken by PushRecord().

    /// <summary>Creates a locker with the given display name and an empty record list.</summary>
    /// <param name="name">Name shown in <see cref="ToString"/> / <see cref="CDL.Dumps"/> output.</param>
    public Locker(string name)
    {
        this.name = name;
        pRecordList = new List<Locker>();
    }

    /// <summary>Appends a snapshot of the current state to <see cref="pRecordList"/>.</summary>
    public void PushRecord()
    {
        pRecordList.Add(RetrieveRecord());
    }

    /// <summary>Removes every snapshot from <see cref="pRecordList"/>.</summary>
    public void ClearRecord()
    {
        pRecordList.Clear();
    }

    /// <summary>
    /// Returns a new <see cref="Locker"/> carrying a copy of this instance's
    /// diagnostic fields. The copy starts with an empty record list.
    /// </summary>
    public Locker RetrieveRecord()
    {
        return new Locker(name)
        {
            tag = tag,
            threadID = threadID,
            lockedTimes = lockedTimes,
            lockingTs = lockingTs,
            enter = enter,
            exit = exit,
            lockingStackTrace = lockingStackTrace,
            lockingDLTs = lockingDLTs,
        };
    }

    /// <summary>
    /// Formats all diagnostic fields, one per line. "LockingEt" is
    /// <c>lockingDLTs - lockingTs</c>; it is negative while no timeout has been recorded.
    /// </summary>
    public override string ToString()
    {
        string records = pRecordList.Count > 0
            ? "LockedRecord:\n\t" + string.Join("\t-record--------------", pRecordList)
            : "";

        string[] strs =
        {
            "ThreadId:" + threadID,
            "Name:" + name,
            "LockedTimes:" + lockedTimes,
            "LockingTs:" + lockingTs,
            // Label fixed from "LocingEt" so it matches the documented dump format.
            "LockingEt:" + unchecked(lockingDLTs - lockingTs) + "(ms",
            "Enter:" + enter,
            "Exit:" + exit,
            "Tag:" + (tag != null ? tag.ToString() : "null"),
            "LockingStackTrace:\n" + lockingStackTrace,
            records,
        };
        return string.Join("\r\n", strs);
    }
}

/// <summary>Callback raised when acquiring a lock timed out.</summary>
public delegate void OnDL();

/// <summary>
/// Check Dead Lock (CDL): a replacement for <c>lock (obj) { ... }</c> that acquires
/// the monitor with a timeout and records who holds what, so that a suspected
/// deadlock can be diagnosed with <see cref="Dumps"/>.
/// author  : Jave.Lin
/// date    : 4/21/2018 5:31:57 PM
/// </summary>
public static class CDL
{
    // Lockers that are currently held.
    public static readonly Dictionary<Locker, bool> _s_pLockedMap = new Dictionary<Locker, bool>();
    // For each held locker: snapshots taken when another CheckDL call started waiting on it.
    public static readonly Dictionary<Locker, List<Locker>> _s_pLockingMap = new Dictionary<Locker, List<Locker>>();

    public const bool THROW_ER = true;
    // Tune as needed: a heavily loaded production server may legitimately hold a
    // lock for a long time, which makes waiters time out without a real deadlock.
    public const int DEAD_LOCK_TIME_OUT = 3000;

    public static event OnDL OnDLEvent;

    // Guards the two maps above and the per-locker record lists. CheckDL runs on many
    // threads at once and Dictionary/List are not safe for concurrent mutation.
    // Never held while user code runs or while waiting on a Locker monitor.
    private static readonly object _s_pMapGate = new object();

    /// <summary>Adds a snapshot of <paramref name="locker"/> to its wait queue. Caller must hold the map gate.</summary>
    private static void _PushToWaitQueue(Locker locker)
    {
        List<Locker> list;
        if (!_s_pLockingMap.TryGetValue(locker, out list))
        {
            list = new List<Locker>();
            _s_pLockingMap[locker] = list;
        }
        list.Add(locker.RetrieveRecord());
    }

    /// <summary>Drops the wait queue of <paramref name="locker"/>. Caller must hold the map gate.</summary>
    private static void _ClearFromLocking(Locker locker)
    {
        List<Locker> list;
        if (_s_pLockingMap.TryGetValue(locker, out list))
        {
            list.Clear();
            _s_pLockingMap.Remove(locker);
        }
    }

    /// <summary>
    /// Called before trying to take the lock: if the locker is currently held,
    /// snapshot the holder's data so the dump shows what the waiter was blocked on.
    /// </summary>
    private static void _BeforeEnter(Locker locker)
    {
        lock (_s_pMapGate)
        {
            // 'enter' is never reset, so it alone only means "was acquired at least once";
            // combined with '!exit' it means "is held right now".
            if (locker.enter && !locker.exit)
            {
                locker.PushRecord();
                _PushToWaitQueue(locker);
            }
        }
    }

    /// <summary>Records the calling thread as the holder of <paramref name="locker"/>.</summary>
    private static void _Enter(Locker locker)
    {
        // Captured outside the gate: walking the stack is comparatively slow.
        string stackTrace = GetCurStackTrace("->\n");
        lock (_s_pMapGate)
        {
            locker.enter = true;
            locker.exit = false;
            locker.lockingTs = Environment.TickCount;
            locker.threadID = Thread.CurrentThread.ManagedThreadId;
            locker.lockingStackTrace = stackTrace;
            _s_pLockedMap[locker] = true;
        }
        Interlocked.Increment(ref locker.lockedTimes);
    }

    /// <summary>
    /// Clears the bookkeeping for <paramref name="locker"/> and releases its monitor.
    /// </summary>
    /// <exception cref="SynchronizationLockException">
    /// The calling thread does not hold the monitor (only when <see cref="THROW_ER"/> is true).
    /// </exception>
    private static void _Exit(Locker locker)
    {
        if (!Monitor.IsEntered(locker))
        {
            // The holder's lockingTs is deliberately left alone here: this thread
            // is not the owner and must not overwrite the owner's diagnostics.
            var msg = "!Monitor.IsEntered(locker)";
            if (THROW_ER)
            {
                throw new SynchronizationLockException(msg);
            }
            else
            {
                _WarningWriteLine(msg);
            }
        }
        else
        {
            try
            {
                lock (_s_pMapGate)
                {
                    locker.exit = true;
                    _s_pLockedMap.Remove(locker);
                    _ClearFromLocking(locker);
                    locker.ClearRecord();
                }
            }
            finally
            {
                // Release the monitor even if the bookkeeping above fails.
                Monitor.Exit(locker);
            }
        }
    }

    /// <summary>Formats the wait queue of <paramref name="locker"/>. Caller must hold the map gate.</summary>
    private static string _GetWaitQueue(Locker locker)
    {
        List<Locker> list;
        return _s_pLockingMap.TryGetValue(locker, out list)
            ? string.Join("\n@@@@@", list)
            : "";
    }

    /// <summary>
    /// Tries to take the monitor of <paramref name="locker"/> within
    /// <see cref="DEAD_LOCK_TIME_OUT"/> milliseconds.
    /// </summary>
    /// <returns>true when the monitor was acquired; false on timeout when <see cref="THROW_ER"/> is false.</returns>
    /// <exception cref="TimeoutException">The wait timed out and <see cref="THROW_ER"/> is true.</exception>
    private static bool _TryAcquire(Locker locker)
    {
        if (Monitor.TryEnter(locker, DEAD_LOCK_TIME_OUT))
        {
            return true;
        }

        lock (_s_pMapGate)
        {
            locker.lockingDLTs = Environment.TickCount;
        }
        _WarningWriteLine("TryEnter time out");
        if (THROW_ER)
        {
            _ShowGetLockTimeout();
        }
        return false;
    }

    /// <summary>
    /// Returns a textual report of every locker that is currently held, ordered by
    /// descending "LockingEt" (<c>lockingDLTs - lockingTs</c>), each followed by its wait queue.
    /// </summary>
    public static string Dumps()
    {
        lock (_s_pMapGate)
        {
            var itemList = new List<Locker>(_s_pLockedMap.Keys);
            // CompareTo instead of subtraction: the difference of two ints can overflow
            // and produce an inconsistent ordering.
            itemList.Sort((a, b) =>
                unchecked(b.lockingDLTs - b.lockingTs).CompareTo(unchecked(a.lockingDLTs - a.lockingTs)));

            var contentList = new List<string>(itemList.Count);
            foreach (var item in itemList)
            {
                contentList.Add(item.ToString() + "\n$$$$$$$$$$Before Locking WaitQueue$$$$$$$$\n" + _GetWaitQueue(item));
            }
            return string.Join("\r\n=line============\r\n", contentList);
        }
    }

    /// <summary>
    /// Runs <paramref name="actoin"/> while holding the monitor of <paramref name="locker"/>.
    /// Exceptions thrown by the action are written to the console and NOT rethrown
    /// (unlike a plain <c>lock</c> statement).
    /// </summary>
    /// <param name="locker">The lock to take.</param>
    /// <param name="actoin">The code to run under the lock.</param>
    /// <exception cref="ArgumentNullException"><paramref name="locker"/> is null.</exception>
    /// <exception cref="TimeoutException">
    /// The lock could not be taken within <see cref="DEAD_LOCK_TIME_OUT"/> ms and
    /// <see cref="THROW_ER"/> is true. The action has not been run in that case.
    /// </exception>
    public static void CheckDL(Locker locker, Action actoin)
    {
        if (locker == null)
        {
            throw new ArgumentNullException(nameof(locker));
        }

        _BeforeEnter(locker);
        // Outside the try block on purpose: a timeout must reach the caller
        // instead of being swallowed by the catch below.
        bool lockTaken = _TryAcquire(locker);
        try
        {
            if (lockTaken)
            {
                _Enter(locker);
            }
            // When THROW_ER is false and the wait timed out, the action still runs, unlocked.
            actoin.Invoke();
        }
        catch (Exception e)
        {
            _ErrorWriteLine(e.ToString());
        }
        finally
        {
            // Only release what was actually acquired.
            if (lockTaken)
            {
                _Exit(locker);
            }
        }
    }

    /// <summary>
    /// Runs <paramref name="actoin"/> while holding the monitor of <paramref name="locker"/>
    /// and returns its result. Exceptions thrown by the action are written to the
    /// console and NOT rethrown; <c>default(T)</c> is returned in that case.
    /// </summary>
    /// <param name="locker">The lock to take.</param>
    /// <param name="actoin">The code to run under the lock.</param>
    /// <returns>The value produced by <paramref name="actoin"/>, or <c>default(T)</c> if it threw.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="locker"/> is null.</exception>
    /// <exception cref="TimeoutException">
    /// The lock could not be taken within <see cref="DEAD_LOCK_TIME_OUT"/> ms and
    /// <see cref="THROW_ER"/> is true. The action has not been run in that case.
    /// </exception>
    public static T CheckDL<T>(Locker locker, Func<T> actoin)
    {
        if (locker == null)
        {
            throw new ArgumentNullException(nameof(locker));
        }

        T ret = default(T);
        _BeforeEnter(locker);
        bool lockTaken = _TryAcquire(locker);
        try
        {
            if (lockTaken)
            {
                _Enter(locker);
            }
            // The result is kept on the unlocked (THROW_ER == false) path as well.
            ret = actoin.Invoke();
        }
        catch (Exception e)
        {
            _ErrorWriteLine(e.ToString());
        }
        finally
        {
            if (lockTaken)
            {
                _Exit(locker);
            }
        }
        return ret;
    }

    /// <summary>
    /// Builds a "Type::Method()" listing of the current call stack, outermost frame
    /// first, skipping this method itself. Walking stops at the first frame whose
    /// IL offset is unknown.
    /// </summary>
    /// <param name="separactor">Text placed between two frames.</param>
    public static string GetCurStackTrace(string separactor = "->")
    {
        System.Diagnostics.StackFrame[] sfs = new System.Diagnostics.StackTrace().GetFrames();
        List<string> methodNameList = new List<string>();
        if (sfs != null)
        {
            for (int i = 1; i < sfs.Length; ++i)
            {
                if (System.Diagnostics.StackFrame.OFFSET_UNKNOWN == sfs[i].GetILOffset())
                {
                    break;
                }
                // GetMethod() and DeclaringType may both be null (e.g. dynamic or global methods).
                var m = sfs[i].GetMethod();
                var dn = m?.DeclaringType?.Name ?? "<unknown>";
                var mn = m?.Name ?? "<unknown>";
                methodNameList.Add(new string(' ', sfs.Length - i) + dn + "::" + mn + "()");
            }
        }
        methodNameList.Reverse();
        return string.Join(separactor, methodNameList);
    }

    /// <summary>
    /// Raises <see cref="OnDLEvent"/>, logs the deadlock banner and throws.
    /// A convenient place for a breakpoint when hunting a deadlock.
    /// </summary>
    /// <exception cref="TimeoutException">Always.</exception>
    private static void _ShowGetLockTimeout()
    {
        OnDLEvent?.Invoke();
        var msg = "!!!!!!!!!!!!!!!!!!DeadLock!!!!!!!!!!!!!!!!!!!!!!!!!!";
        _ErrorWriteLine(msg);
        throw new TimeoutException(msg);
    }

    /// <summary>Writes a line to the console in red.</summary>
    private static void _ErrorWriteLine(string msg, params object[] args)
    {
        _WriteLineColored(ConsoleColor.Red, msg, args);
    }

    /// <summary>Writes a line to the console in yellow.</summary>
    private static void _WarningWriteLine(string msg, params object[] args)
    {
        _WriteLineColored(ConsoleColor.Yellow, msg, args);
    }

    /// <summary>
    /// Writes a line in the given colour and restores the previous colour afterwards.
    /// </summary>
    private static void _WriteLineColored(ConsoleColor color, string msg, object[] args)
    {
        var srcColor = Console.ForegroundColor;
        Console.ForegroundColor = color;
        try
        {
            if (args == null || args.Length == 0)
            {
                // No arguments: print verbatim. Treating arbitrary text (such as
                // Exception.ToString()) as a format string throws on '{' or '}'.
                Console.WriteLine(msg);
            }
            else
            {
                Console.WriteLine(msg, args);
            }
        }
        finally
        {
            Console.ForegroundColor = srcColor;
        }
    }
}

//}
使用方法要點、注意副作用、處理副作用
使用方法要點
// 我們平常用的:
lock (obj)
{
    // code here
}
// 改寫成(注意:此處的 obj 必須是 Locker 物件):
CDL.CheckDL(obj, () =>
{
    // code here
});
// 怎麼方便改,是個問題
// 用到的 lock 少的話,手動一個個改吧
// 如果巨量的話,建議 CTRL + SHIFT + H 來批量替換 lock 的代碼吧(寫個正則)
// 將 CDL 的 namespace 去掉,這樣就不用導入 namespace 了。
一旦有死結出現,那麼將會命中CDL::_ShowGetLockTimeout方法
然後將CDL.Dumps()的內容列印出來,就可以知道,當前哪些CDL.CheckDL的地方有死結。
Dumps很詳細,具體還可以根據自己的需要來對 Locker的資訊做調整。 從dumps資訊中辨別死結
Dumps資訊中,辨別哪些是死結狀態的,看:LockingEt的值是多少就知道了
LockingEt是:Locking Elapsed Time的意思,擷取鎖多長時間了
LockingEt值只要大於零,且接近於:CDL.DEAD_LOCK_TIME_OUT的值,都基本上是死結 注意副作用
明顯原來的代碼邏輯會發生改變
多了一些方法調用
特別是將原來的代碼的位置,改變了,放到了一個lambda(其實在IL中是個匿名函數)
先不說結構上變化了,效率也會有丟丟影響的。 處理副作用
還有一個方法可以去除副作用,就是再寫個工具,將編譯出來的DLL中CDL.CheckDL的IL代碼批量改回原來的lock(obj)方式。當然,前提是先經過大量測試後再用這個工具處理,不然如果中途還是出現了死結的話,定位問題還是會很頭疼的。(製作這個工具,理論上是可以的,但需要對IL熟悉)
注意CDL.DEAD_LOCK_TIME_OUT
CDL.DEAD_LOCK_TIME_OUT = 3000; // 這個閾值按需調整:實際的線上產品伺服器程式如果負載過大,可能也會有部分任務處理過久,導致「取鎖」等待過久,所以出現 Monitor.TryEnter 逾時時,不一定是死結。
你可以按你的需求來調整該值,如:調整為 60000(60秒),意思是你確定了:取鎖等待超過60秒的,都算是由死結任務導致。
CDL.Dumps內容範例
ThreadId:10Name:CBCServerAliveLockerLockedTimes:1LockingTs:7496815LockingEt:-7496815(ms // =========辨別死結=======負數的都不用看Enter:TrueExit:FalseTag:nullLockingStackTrace: Program::Main()-> XXXServer::WaitForExit()-> CDL::CheckDL()=line============ThreadId:15Name:CTcpNetworkerLockedTimes:486LockingTs:7519248LockingEt:2995(ms // =辨別死結=此乃死結也,接近CDL.DEAD_LOCK_TIME_OUT(3000 MS)的值,由於CBattleRoom的Locker線程ID為6的鎖逾時而導致的Enter:TrueExit:FalseTag:nullLockingStackTrace: _IOCompletionCallback::PerformIOCompletionCallback()-> BaseOverlappedAsyncResult::CompletionPortCallback()-> LazyAsyncResult::ProtectedInvokeCallback()-> ContextAwareResult::Complete()-> ExecutionContext::Run()-> ExecutionContext::Run()-> ExecutionContext::RunInternal()-> ContextAwareResult::CompleteCallback()-> LazyAsyncResult::Complete()-> XXXNetworker::_OnBeginReceiveCallback()-> CDL::CheckDL()=line============ThreadId:15Name:CEventMgrLockedTimes:247LockingTs:7519248LockingEt:-7519248(ms // =========辨別死結=======負數的都不用看Enter:TrueExit:FalseTag:nullLockingStackTrace: _IOCompletionCallback::PerformIOCompletionCallback()-> BaseOverlappedAsyncResult::CompletionPortCallback()-> LazyAsyncResult::ProtectedInvokeCallback()-> ContextAwareResult::Complete()-> ExecutionContext::Run()-> ExecutionContext::Run()-> ExecutionContext::RunInternal()-> ContextAwareResult::CompleteCallback()-> LazyAsyncResult::Complete()-> XXXNetworker::_OnBeginReceiveCallback()-> CDL::CheckDL()-> <>c__DisplayClass78_0::<_OnBeginReceiveCallback>b__0()-> XXXConnection::XXXNetworker_OnPackageEvent()-> CEventMgr::Invoke()-> CDL::CheckDL()=line============ThreadId:6Name:CBattleRoomMgrLockedTimes:689LockingTs:7519248LockingEt:-7519248(ms // =========辨別死結=======負數的都不用看Enter:TrueExit:FalseTag:nullLockingStackTrace: _ThreadPoolWaitCallback::PerformWaitCallback()-> ThreadPoolWorkQueue::Dispatch()-> Task::System.Threading.IThreadPoolWorkItem.ExecuteWorkItem()-> Task::ExecuteEntry()-> Task::ExecuteWithThreadLocal()-> ExecutionContext::Run()-> 
ExecutionContext::RunInternal()-> Task::ExecutionContextCallback()-> Task::Execute()-> Task::InnerInvoke()-> XXXServer::<StartUp>b__36_0()-> CDL::CheckDL()-> XXXServer::<StartUp>b__36_1()-> XXXServer::_DeadLockMethod()-> XXXServerInst::_DeadLockMethod1()-> CDL::CheckDL()=line============ThreadId:6Name:CBattleRoomLockedTimes:802LockingTs:7519248LockingEt:2995(ms // ==============辨別死結======此乃死結也,接近CDL.DEAD_LOCK_TIME_OUT(3000 MS)的值Enter:TrueExit:FalseTag:BroadcastAll3 startingLockingStackTrace: _ThreadPoolWaitCallback::PerformWaitCallback()-> ThreadPoolWorkQueue::Dispatch()-> Task::System.Threading.IThreadPoolWorkItem.ExecuteWorkItem()-> Task::ExecuteEntry()-> Task::ExecuteWithThreadLocal