Skip to content

Commit 153ea47

Browse files
mihirpat1yxieca
authored andcommitted
Xcvrd should restart if any child thread crashes (#326)
* Xcvrd should restart if any child thread crashes Signed-off-by: Mihir Patel <patelmi@microsoft.com> * Resolved test_SfpStateUpdateTask_task_run_stop test failure * Added comment for raise_exception Signed-off-by: Mihir Patel <patelmi@microsoft.com> * Added check for avoiding cmis_manager.start() if CMIS thread is supposed to be skipped. Also, moidified join function to handle accordingly. Added logs for showing names of threads spawned Signed-off-by: Mihir Patel <patelmi@microsoft.com> Signed-off-by: Mihir Patel <patelmi@microsoft.com>
1 parent 2d546b4 commit 153ea47

File tree

2 files changed

+266
-88
lines changed

2 files changed

+266
-88
lines changed

sonic-xcvrd/tests/test_xcvrd.py

Lines changed: 140 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,95 @@
4040
global_media_settings = media_settings_with_comma_dict['GLOBAL_MEDIA_SETTINGS'].pop('1-32')
4141
media_settings_with_comma_dict['GLOBAL_MEDIA_SETTINGS']['1-5,6,7-20,21-32'] = global_media_settings
4242

43+
class TestXcvrdThreadException(object):
44+
45+
@patch('xcvrd.xcvrd.platform_chassis', MagicMock())
46+
def test_CmisManagerTask_task_run_with_exception(self):
47+
port_mapping = PortMapping()
48+
stop_event = threading.Event()
49+
cmis_manager = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
50+
cmis_manager.wait_for_port_config_done = MagicMock(side_effect = NotImplementedError)
51+
exception_received = None
52+
trace = None
53+
try:
54+
cmis_manager.start()
55+
cmis_manager.join()
56+
except Exception as e1:
57+
exception_received = e1
58+
trace = traceback.format_exc()
59+
60+
assert not cmis_manager.is_alive()
61+
assert(type(exception_received) == NotImplementedError)
62+
assert("NotImplementedError" in str(trace) and "effect" in str(trace))
63+
assert("sonic-xcvrd/xcvrd/xcvrd.py" in str(trace))
64+
assert("wait_for_port_config_done" in str(trace))
65+
66+
@patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_config_change', MagicMock(side_effect = NotImplementedError))
67+
def test_DomInfoUpdateTask_task_run_with_exception(self):
68+
port_mapping = PortMapping()
69+
stop_event = threading.Event()
70+
dom_info_update = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
71+
exception_received = None
72+
trace = None
73+
try:
74+
dom_info_update.start()
75+
dom_info_update.join()
76+
except Exception as e1:
77+
exception_received = e1
78+
trace = traceback.format_exc()
79+
80+
assert not dom_info_update.is_alive()
81+
assert(type(exception_received) == NotImplementedError)
82+
assert("NotImplementedError" in str(trace) and "effect" in str(trace))
83+
assert("sonic-xcvrd/xcvrd/xcvrd.py" in str(trace))
84+
assert("subscribe_port_config_change" in str(trace))
85+
86+
@patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_config_change', MagicMock(side_effect = NotImplementedError))
87+
def test_SfpStateUpdateTask_task_run_with_exception(self):
88+
port_mapping = PortMapping()
89+
retry_eeprom_set = set()
90+
stop_event = threading.Event()
91+
sfp_error_event = threading.Event()
92+
sfp_state_update = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set, stop_event, sfp_error_event)
93+
exception_received = None
94+
trace = None
95+
try:
96+
sfp_state_update.start()
97+
sfp_state_update.join()
98+
except Exception as e1:
99+
exception_received = e1
100+
trace = traceback.format_exc()
101+
102+
assert not sfp_state_update.is_alive()
103+
assert(type(exception_received) == NotImplementedError)
104+
assert("NotImplementedError" in str(trace) and "effect" in str(trace))
105+
assert("sonic-xcvrd/xcvrd/xcvrd.py" in str(trace))
106+
assert("subscribe_port_config_change" in str(trace))
107+
108+
@patch('xcvrd.xcvrd.SfpStateUpdateTask.is_alive', MagicMock(return_value = False))
109+
@patch('xcvrd.xcvrd.DomInfoUpdateTask.is_alive', MagicMock(return_value = False))
110+
@patch('xcvrd.xcvrd.CmisManagerTask.is_alive', MagicMock(return_value = False))
111+
@patch('xcvrd.xcvrd.CmisManagerTask.join', MagicMock(side_effect = NotImplementedError))
112+
@patch('xcvrd.xcvrd.CmisManagerTask.start', MagicMock())
113+
@patch('xcvrd.xcvrd.DomInfoUpdateTask.start', MagicMock())
114+
@patch('xcvrd.xcvrd.SfpStateUpdateTask.start', MagicMock())
115+
@patch('xcvrd.xcvrd.DaemonXcvrd.deinit', MagicMock())
116+
@patch('os.kill')
117+
@patch('xcvrd.xcvrd.DaemonXcvrd.init')
118+
@patch('xcvrd.xcvrd.DomInfoUpdateTask.join')
119+
@patch('xcvrd.xcvrd.SfpStateUpdateTask.join')
120+
def test_DaemonXcvrd_run_with_exception(self, mock_task_join1, mock_task_join2, mock_init, mock_os_kill):
121+
mock_init.return_value = (PortMapping(), set())
122+
xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER)
123+
xcvrd.stop_event.wait = MagicMock()
124+
xcvrd.run()
125+
126+
assert len(xcvrd.threads) == 3
127+
assert mock_init.call_count == 1
128+
assert mock_task_join1.call_count == 1
129+
assert mock_task_join2.call_count == 1
130+
assert mock_os_kill.call_count == 1
131+
43132
class TestXcvrdScript(object):
44133

45134
@patch('xcvrd.xcvrd._wrapper_get_sfp_type')
@@ -482,10 +571,10 @@ def test_DaemonXcvrd_wait_for_port_config_done(self, mock_select, mock_sub_table
482571

483572
@patch('xcvrd.xcvrd.DaemonXcvrd.init')
484573
@patch('xcvrd.xcvrd.DaemonXcvrd.deinit')
485-
@patch('xcvrd.xcvrd.DomInfoUpdateTask.task_run')
486-
@patch('xcvrd.xcvrd.SfpStateUpdateTask.task_run')
487-
@patch('xcvrd.xcvrd.DomInfoUpdateTask.task_stop')
488-
@patch('xcvrd.xcvrd.SfpStateUpdateTask.task_stop')
574+
@patch('xcvrd.xcvrd.DomInfoUpdateTask.start')
575+
@patch('xcvrd.xcvrd.SfpStateUpdateTask.start')
576+
@patch('xcvrd.xcvrd.DomInfoUpdateTask.join')
577+
@patch('xcvrd.xcvrd.SfpStateUpdateTask.join')
489578
def test_DaemonXcvrd_run(self, mock_task_stop1, mock_task_stop2, mock_task_run1, mock_task_run2, mock_deinit, mock_init):
490579
mock_init.return_value = (PortMapping(), set())
491580
xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER)
@@ -502,7 +591,8 @@ def test_DaemonXcvrd_run(self, mock_task_stop1, mock_task_stop2, mock_task_run1,
502591
@patch('xcvrd.xcvrd._wrapper_get_sfp_type', MagicMock(return_value='QSFP_DD'))
503592
def test_CmisManagerTask_handle_port_change_event(self):
504593
port_mapping = PortMapping()
505-
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping)
594+
stop_event = threading.Event()
595+
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
506596

507597
assert not task.isPortConfigDone
508598
port_change_event = PortChangeEvent('PortConfigDone', -1, 0, PortChangeEvent.PORT_SET)
@@ -529,7 +619,8 @@ def test_CmisManagerTask_handle_port_change_event(self):
529619
@patch('xcvrd.xcvrd.XcvrTableHelper')
530620
def test_CmisManagerTask_get_configured_freq(self, mock_table_helper):
531621
port_mapping = PortMapping()
532-
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping)
622+
stop_event = threading.Event()
623+
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
533624
cfg_port_tbl = MagicMock()
534625
cfg_port_tbl.get = MagicMock(return_value=(True, (('laser_freq', 193100),)))
535626
mock_table_helper.get_cfg_port_tbl = MagicMock(return_value=cfg_port_tbl)
@@ -539,7 +630,8 @@ def test_CmisManagerTask_get_configured_freq(self, mock_table_helper):
539630
@patch('xcvrd.xcvrd.XcvrTableHelper')
540631
def test_CmisManagerTask_get_configured_tx_power_from_db(self, mock_table_helper):
541632
port_mapping = PortMapping()
542-
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping)
633+
stop_event = threading.Event()
634+
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
543635
cfg_port_tbl = MagicMock()
544636
cfg_port_tbl.get = MagicMock(return_value=(True, (('tx_power', -10),)))
545637
mock_table_helper.get_cfg_port_tbl = MagicMock(return_value=cfg_port_tbl)
@@ -555,11 +647,12 @@ def test_CmisManagerTask_task_run_stop(self, mock_chassis):
555647
mock_chassis.get_all_sfps = MagicMock(return_value=[mock_object, mock_object])
556648

557649
port_mapping = PortMapping()
558-
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping)
559-
task.wait_for_port_config_done = MagicMock()
560-
task.task_run()
561-
task.task_stop()
562-
assert task.task_process is None
650+
stop_event = threading.Event()
651+
cmis_manager = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
652+
cmis_manager.wait_for_port_config_done = MagicMock()
653+
cmis_manager.start()
654+
cmis_manager.join()
655+
assert not cmis_manager.is_alive()
563656

564657
@patch('xcvrd.xcvrd.platform_chassis')
565658
@patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', MagicMock(return_value=(None, None)))
@@ -658,7 +751,8 @@ def test_CmisManagerTask_task_worker(self, mock_chassis):
658751
mock_chassis.get_sfp = MagicMock(return_value=mock_sfp)
659752

660753
port_mapping = PortMapping()
661-
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping)
754+
stop_event = threading.Event()
755+
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
662756

663757
port_change_event = PortChangeEvent('PortConfigDone', -1, 0, PortChangeEvent.PORT_SET)
664758
task.on_port_update_event(port_change_event)
@@ -709,7 +803,8 @@ def test_CmisManagerTask_task_worker(self, mock_chassis):
709803
@patch('xcvrd.xcvrd.delete_port_from_status_table_hw')
710804
def test_DomInfoUpdateTask_handle_port_change_event(self, mock_del_status_tbl_hw):
711805
port_mapping = PortMapping()
712-
task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping)
806+
stop_event = threading.Event()
807+
task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
713808
task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE)
714809
port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_ADD)
715810
task.on_port_config_change(port_change_event)
@@ -731,10 +826,11 @@ def test_DomInfoUpdateTask_handle_port_change_event(self, mock_del_status_tbl_hw
731826
@patch('xcvrd.xcvrd_utilities.port_mapping.handle_port_config_change', MagicMock())
732827
def test_DomInfoUpdateTask_task_run_stop(self):
733828
port_mapping = PortMapping()
734-
task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping)
735-
task.task_run()
736-
task.task_stop()
737-
assert not task.task_thread.is_alive()
829+
stop_event = threading.Event()
830+
task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
831+
task.start()
832+
task.join()
833+
assert not task.is_alive()
738834

739835
@patch('xcvrd.xcvrd.XcvrTableHelper', MagicMock())
740836
@patch('xcvrd.xcvrd_utilities.sfp_status_helper.detect_port_in_error_status')
@@ -755,7 +851,8 @@ def test_DomInfoUpdateTask_task_worker(self, mock_post_pm_info, mock_update_stat
755851
mock_sub_table.return_value = mock_selectable
756852

757853
port_mapping = PortMapping()
758-
task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping)
854+
stop_event = threading.Event()
855+
task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
759856
task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE)
760857
task.task_stopping_event.wait = MagicMock(side_effect=[False, True])
761858
mock_detect_error.return_value = True
@@ -786,10 +883,11 @@ def test_SfpStateUpdateTask_handle_port_change_event(self, mock_update_status_hw
786883
mock_table_helper.get_int_tbl = MagicMock(return_value=mock_table)
787884
mock_table_helper.get_dom_tbl = MagicMock(return_value=mock_table)
788885
mock_table_helper.get_dom_threshold_tbl = MagicMock(return_value=mock_table)
789-
stopping_event = multiprocessing.Event()
886+
stop_event = threading.Event()
887+
sfp_error_event = threading.Event()
790888
port_mapping = PortMapping()
791889
retry_eeprom_set = set()
792-
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set)
890+
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set, stop_event, sfp_error_event)
793891
task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE)
794892
task.xcvr_table_helper.get_status_tbl = mock_table_helper.get_status_tbl
795893
task.xcvr_table_helper.get_intf_tbl = mock_table_helper.get_intf_tbl
@@ -822,15 +920,18 @@ def test_SfpStateUpdateTask_handle_port_change_event(self, mock_update_status_hw
822920
assert not task.port_mapping.logical_to_asic
823921
assert mock_update_status_hw.call_count == 1
824922

923+
@patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_config_change', MagicMock(return_value=(None, None)))
825924
def test_SfpStateUpdateTask_task_run_stop(self):
826925
port_mapping = PortMapping()
827926
retry_eeprom_set = set()
828-
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set)
829-
sfp_error_event = multiprocessing.Event()
830-
task.task_run(sfp_error_event)
831-
assert wait_until(5, 1, task.task_process.is_alive)
832-
task.task_stop()
833-
assert wait_until(5, 1, lambda: task.task_process.is_alive() is False)
927+
stop_event = threading.Event()
928+
sfp_error_event = threading.Event()
929+
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set, stop_event, sfp_error_event)
930+
task.start()
931+
assert wait_until(5, 1, task.is_alive)
932+
task.raise_exception()
933+
task.join()
934+
assert wait_until(5, 1, lambda: task.is_alive() is False)
834935

835936
@patch('xcvrd.xcvrd.XcvrTableHelper', MagicMock())
836937
@patch('xcvrd.xcvrd.post_port_sfp_info_to_db')
@@ -841,7 +942,9 @@ def test_SfpStateUpdateTask_retry_eeprom_reading(self, mock_update_status_hw, mo
841942

842943
port_mapping = PortMapping()
843944
retry_eeprom_set = set()
844-
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set)
945+
stop_event = threading.Event()
946+
sfp_error_event = threading.Event()
947+
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set, stop_event, sfp_error_event)
845948
task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE)
846949
task.xcvr_table_helper.get_intf_tbl = MagicMock(return_value=mock_table)
847950
task.xcvr_table_helper.get_dom_tbl = MagicMock(return_value=mock_table)
@@ -873,7 +976,9 @@ def test_SfpStateUpdateTask_retry_eeprom_reading(self, mock_update_status_hw, mo
873976
def test_SfpStateUpdateTask_mapping_event_from_change_event(self):
874977
port_mapping = PortMapping()
875978
retry_eeprom_set = set()
876-
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set)
979+
stop_event = threading.Event()
980+
sfp_error_event = threading.Event()
981+
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set, stop_event, sfp_error_event)
877982
port_dict = {}
878983
assert task._mapping_event_from_change_event(False, port_dict) == SYSTEM_FAIL
879984
assert port_dict[EVENT_ON_ALL_SFP] == SYSTEM_FAIL
@@ -910,10 +1015,10 @@ def test_SfpStateUpdateTask_task_worker(self, mock_post_pm_info, mock_del_status
9101015
mock_del_dom, mock_change_event, mock_mapping_event, mock_os_kill):
9111016
port_mapping = PortMapping()
9121017
retry_eeprom_set = set()
913-
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set)
1018+
stop_event = threading.Event()
1019+
sfp_error_event = threading.Event()
1020+
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set, stop_event, sfp_error_event)
9141021
task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE)
915-
stop_event = multiprocessing.Event()
916-
sfp_error_event = multiprocessing.Event()
9171022
mock_change_event.return_value = (True, {0: 0}, {})
9181023
mock_mapping_event.return_value = SYSTEM_NOT_READY
9191024

@@ -1036,7 +1141,9 @@ class MockTable:
10361141

10371142
port_mapping = PortMapping()
10381143
retry_eeprom_set = set()
1039-
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set)
1144+
stop_event = threading.Event()
1145+
sfp_error_event = threading.Event()
1146+
task = SfpStateUpdateTask(DEFAULT_NAMESPACE, port_mapping, retry_eeprom_set, stop_event, sfp_error_event)
10401147
task.xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE)
10411148
task.xcvr_table_helper.get_status_tbl = mock_table_helper.get_status_tbl
10421149
task.xcvr_table_helper.get_intf_tbl = mock_table_helper.get_intf_tbl

0 commit comments

Comments
 (0)