NotifierMngr agent core dumps when running 'hastop -local -force' following upgrade to SFHA 6.2/Solaris 10 and the resource is left in W_ONLINE_REVERSE_PROPAGATE state

book

Article ID: 100015505

calendar_today

Updated On:

Description

Error Message

The NotifierMngr agent core dump shows the following stack:

::::::::::::::
8530:   /opt/VRTSvcs/bin/NotifierMngr/NotifierMngrAgent -type NotifierMngr
-----------------  lwp# 1 / thread# 1  --------------------
 fec4e7ac pollsys  (ffbfd150, 0, ffbfd1b8, 0)
 febe9c5c pselect  (ffbfd150, fecc6790, fecc6790, 0, ffbfd1b8, 0) + 1c8
 febe9fd4 select   (0, 0, 0, 0, ffbfd284, ff291814) + a0
 ff218d44 __1cIVCSSleep6FpnHtimeval__v_(ffbfd284, 66ba8, 4f66, 4167656e, 80808080, 1010101) + 1c
 ff22748c __1cOVCSThreadYield6F_v_(56678, 82980, 64fa8, 743, 0, 64fa8) + 14
 ff13c468 main_loop (4a160, ff27f43d, ff27f8e6, 96e, 0, ff27f8f2) + 488
 ff13d998 VCSAgMain (3, ffbffcac, 0, 0, 0, 0) + 14d8
 ff13d9d0 main     (3, ffbffcac, ffbffcbc, 25800, fe850080, 0) + 18
 00011d70 _start   (0, 0, 0, 0, 0, 0) + d8
-----------------  lwp# 2 / thread# 2  --------------------
 fec4af80 lwp_park (0, 0, 0)
 fec44fc8 cond_wait_queue (45220, 4c950, 0, 0, 1c00, 0) + 4c
 fec45510 cond_wait (45220, 4c950, 0, 1c00, 9, fffc00) + 10
 fec4554c pthread_cond_wait (45220, 4c950, 0, 0, 4c950, fec42fd0) + 8
 ff224c34 __1cNVCSCondCvwait6FpnN_pthread_cond_pnO_pthread_mutex_i_v_(45220, 4c950, ffffffff, 0, fe6e0200, 80808080) + 4c
 ff224e74 __1cNVCSCondCvwait6FpnN_pthread_cond_pnNVCSLockStruct_i_v_(45220, 4c950, ffffffff,ff2a4d0e, 0, 0) + 44
 ff1c53a8 __1cDLogDdeq6Fi_pnFDList__(ffffffff, 1, ff2a4d0e, 27d4, 0, 8000) + 98
 ff1c4f84 logger_threadx_func (8000, 1, 0, 27d4, 0, 45270) + e4
 fec4aee0 _lwp_start (0, 0, 0, 0, 0, 0)
-----------------  lwp# 3 / thread# 3  --------------------
 fec4e7ac pollsys  (fe57fe58, 0, fe57fec0, 0)
 febe9c5c pselect  (fe57fe58, fecc6790, fecc6790, 0, fe57fec0, 0) + 1c8
 febe9fd4 select   (0, 0, 0, 0, fe57ff8c, ff28caf8) + a0
 ff218d44 __1cIVCSSleep6FpnHtimeval__v_(fe57ff8c, ff27f228, ff27f252, 66a, 0, ff27f25e) + 1c
 ff13b710 vcsag_timer_thread_func (ffbfd3dc, fe580000, 0, 0, ff13b610, 1) + 100
 fec4aee0 _lwp_start (0, 0, 0, 0, 0, 0)
-----------------  lwp# 4 / thread# 4  --------------------
 fec4eec8 waitid   (0, 2155, fe3f46a8, 3)
 fec3e868 waitpid  (2155, fe3f4c0c, 0, 0, fecc79a0, fe6e1200) + 60
 ff217a94 __1cZVCSCreateProcessForSignal6Fpkci_i_(fe3f6c80, a, ff2b28a9, 2152, fe3f78f4, 0) + 184
 ff21ab1c __1cMVCSDumpStack6Fpkci_v_(fe3f78f4, a, 2152, 0, ff0000, 80808080) + 7c
 ff21b608 __1cQVCSSignalHandler6Fi_v_(b, 0, 0, 0, 0, 0) + 230
 ff21b694 VCSSegvHandler (b, 0, 0, 0, 0, 0) + c
 ff13bb88 vcsag_diag_handler (b, 0, fe3f8388, 1, 0, 0) + 98
 fec4b00c __sighndlr (b, 0, fe3f8388, ff13baf0, 0, 1) + c
 fec3f6bc call_user_handler (b, 0, 4, 0, fe6e1200, fe3f8388) + 3b8
 fec3f8a4 sigacthandler (b, 0, fe3f8388, fe3f94b8, 0, 0) + 60
 --- called from signal handler with signal 11 (SIGSEGV) ---
 febb2d50 strlen   (14c19, fe3f9618, 81d7a, 0, 0, 0) + 50
 fec21c3c vsnprintf (fe3fa68c, 1000, 14c05, fe3f9608, 7ffffc00, 0) + 6c
 ff146128 VCSAgSnprintf (fe3fa68c, 1000, 14c05, fe3f968c, 80050, 81d78) + 40
 0001348c res_online (0, 83208, ff291a29, 1795, 0, ff291a35) + bc4
 ff19070c __1cNVCSAgEPStructLcall_online6Mpkcppv_I_(6c088, fe3fdf3c, 83208, 689, 0, ff28efc5) + 34
 ff176b84 __1cJVCSAgTypeLcall_online6Mpkcp1ppv5pnOVCSAgContainer__I_(4dbf8, fe3fdb3c, fe3fdf3c, 83208, 83a28, 811a0) + 364
 ff150824 __1cIVCSAgResQcall_entry_point6MnPVCSAgEntryPoint_pvpnNVCSAgIntState__nMVCSAgRetType__(10000, 7, fe3fe504, 2c6f8, 17f9, 4000) + 261c
 ff131cb8 __1cNVCSAgIntStateOprocess_online6FpnIVCSAgRes_pnFVList__nJVCSAgBool__(6a510, 64990, 1, ff2864cc, ff2864e9, 17f9) + 340
 ff12d584 __1cOVCSAgISProbingGonline6MpnIVCSAgRes_pnFVList__nJVCSAgBool__(2c7b8, 6a510, 64990, 17f9, 0, ff2864f4) + 54
 ff15b118 __1cIVCSAgResLprocess_cmd6MpnFVList_pi_nJVCSAgBool__(6a510, 0, fe3fea9c, 1c43, 0, ff286b7d) + 5b0
 ff15d8fc __1cIVCSAgResQprocess_resource6Fp0pi_v_(6a510, fe3fef0c, 265, 251, 0, ff295364) + a4
 ff18bb40 vcsag_process_scheduler_queue (4a160, ff29527e, ff2952ec, 23a, 0, ff2952f7) + 150
 ff18b980 vcsag_service_thread_start (0, fe400000, 0, 0, ff18b740, 1) + 240
 fec4aee0 _lwp_start (0, 0, 0, 0, 0, 0)


and the engine_A.log file shows messages similar to:

2015/03/06 11:06:53 VCS NOTICE V-16-1-10300 Initiating Offline of Resource ntfr (Owner: Unspecified, Group: ClusterService) on System node02
2015/03/06 11:06:56 VCS INFO V-16-1-10305 Resource ntfr (Owner: Unspecified, Group: ClusterService) is offline on node02 (VCS initiated)
2015/03/06 13:23:19 VCS INFO V-16-1-10304 Resource ntfr (Owner: unknown, Group: ClusterService) is offline on node02 (First probe)
2015/03/06 15:51:03 VCS INFO V-16-1-10304 Resource ntfr (Owner: Unspecified, Group: ClusterService) is offline on node02 (First probe)
2015/03/06 15:51:34 VCS NOTICE V-16-1-10301 Initiating Online of Resource ntfr (Owner: Unspecified, Group: ClusterService) on System node02
2015/03/06 15:53:19 VCS NOTICE V-16-1-53034 Agent for type NotifierMngr is restarted. Re-sending online to resource ntfr (Previous IState = RESOURCE_W_ONLINE)
2015/03/06 15:57:43 VCS WARNING V-16-1-10023 Agent NotifierMngr not sending alive messages since Fri Mar  6 15:55:31 2015
...
2015/03/06 16:02:07 VCS NOTICE V-16-1-53034 Agent for type NotifierMngr is restarted. Re-sending online to resource ntfr (Previous IState = RESOURCE_W_ONLINE)
2015/03/06 16:04:19 VCS ERROR V-16-1-10009 Agent NotifierMngr has faulted 6 times in less than 950 seconds -- Will not attempt to restart. Correct the problem and use haagent -start to start the agent
2015/03/16 13:23:14 VCS NOTICE V-16-1-53034 Agent for type NotifierMngr is restarted. Re-sending online to resource ntfr (Previous IState = RESOURCE_W_ONLINE_REVERSE_PROPAGATE)
2015/03/17 13:21:40 VCS NOTICE V-16-1-53034 Agent for type NotifierMngr is restarted. Re-sending online to resource ntfr (Previous IState = RESOURCE_W_ONLINE_REVERSE_PROPAGATE)
2015/03/17 13:23:52 VCS NOTICE V-16-1-53034 Agent for type NotifierMngr is restarted. Re-sending online to resource ntfr (Previous IState = RESOURCE_W_ONLINE_REVERSE_PROPAGATE)
2015/03/17 13:26:04 VCS NOTICE V-16-1-53034 Agent for type NotifierMngr is restarted. Re-sending online to resource ntfr (Previous IState = RESOURCE_W_ONLINE_REVERSE_PROPAGATE)
2015/03/17 13:28:16 VCS NOTICE V-16-1-53034 Agent for type NotifierMngr is restarted. Re-sending online to resource ntfr (Previous IState = RESOURCE_W_ONLINE_REVERSE_PROPAGATE)

Cause

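This problem was caused by the MessageExpiryInterval attribute being missing from the NotifierMngr type definition in the /etc/VRTSvcs/conf/config/types.cf file.

For example, the following type definition has no MessageExpiryInterval entry, either in ArgList or as an attribute declaration: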
type NotifierMngr (
	static int RestartLimit = 3
	static str ArgList[] = { EngineListeningPort, MessagesQueue, NotifierListeningPort, NotifierSourceIP, SnmpdTrapPort, SnmpCommunity, SnmpConsoles, SmtpServer, SmtpServerVrfyOff, SmtpServerTimeout, SmtpReturnPath, SmtpFromPath, SmtpRecipients }
	int EngineListeningPort = 14141
	int MessagesQueue = 30
	int NotifierListeningPort = 14144
	str NotifierSourceIP
	int SnmpdTrapPort = 162
	str SnmpCommunity = public
	str SnmpConsoles{}
	str SmtpServer
	boolean SmtpServerVrfyOff = 0
	int SmtpServerTimeout = 10
	str SmtpReturnPath
	str SmtpFromPath
	str SmtpRecipients{}
)

Resolution

Once the MessageExpiryInterval attribute was added to the ArgList of the NotifierMngr type definition and a value was set for it in

/etc/VRTSvcs/conf/config/types.cf

type NotifierMngr (
        static str OnlinePriority = 0
        static int RestartLimit = 3
        static str ArgList[] = { EngineListeningPort, MessagesQueue, MessageExpiryInterval, NotifierListeningPort, NotifierSourceIP, SnmpdTrapPort, SnmpCommunity, SnmpConsoles, SmtpServer, SmtpServerVrfyOff, SmtpServerTimeout, SmtpReturnPath, SmtpFromPath, SmtpRecipients }
        static boolean AEPTimeout = 1
        static keylist SupportedOperations = { OnOff }
        int MessageExpiryInterval = 3600
        boolean SmtpServerVrfyOff = 0
        str SmtpFromPath
        str SnmpCommunity = public
        str NotifierSourceIP
        str SmtpServer
        str SmtpRecipients{}
        int EngineListeningPort = 14141
        str SmtpReturnPath
        int SmtpServerTimeout = 10
        str SnmpConsoles{}
        int NotifierListeningPort = 14144
        int MessagesQueue = 30
        int SnmpdTrapPort = 162
)


the NotifierMngr agent no longer core dumped and everything worked as expected.






Issue/Introduction

NotifierMngr agent core dumps when running 'hastop -local -force' following upgrade to SFHA 6.2/Solaris 10 and the resource is left in W_ONLINE_REVERSE_PROPAGATE state

Additional Information

ETrack: 3775659