defect 1170 - assert in memb_state_recovery_enter (was) Re: [Openais] looks like the synchronization code is still broke

Steven Dake sdake at redhat.com
Wed Apr 5 16:52:05 PDT 2006


It looks like this is a crash with 5 processors.  Is that true?  Is the
patch for defect 1170 applied?  The line number for the assert doesn't
match my tree.  I also need the output from the crash, which contains
some other valuable information, and the value of the "low_ring_aru"
variable.

A description of how this fault was generated would be helpful (assuming
the patch for defect 1170 was applied).

Regards
-steve

On Wed, 2006-04-05 at 18:42 +0200, Fabien THOMAS wrote:
> #0  0x28187723 in kill () from /lib/libc.so.6
> [New LWP 100091]
> (gdb) bt
> #0  0x28187723 in kill () from /lib/libc.so.6
> #1  0x280b61da in raise () from /usr/lib/libpthread.so.2
> #2  0x281863d4 in abort () from /lib/libc.so.6
> #3  0x28164358 in __assert () from /lib/libc.so.6
> #4  0x08051814 in memb_state_recovery_enter (instance=0x83c6000,  
> commit_token=0x83e2650)
>      at totemsrp.c:1626
> #5  0x08056944 in message_handler_memb_commit_token  
> (instance=0x83c6000, system_from=0x3fbfeb90,
>      msg=0x83e2650, msg_len=2102, endian_conversion_needed=0) at  
> totemsrp.c:3583
> #6  0x08056af1 in main_deliver_fn (context=0x83c6000,  
> system_from=0x3fbfeb90, msg=0x83e2650,
>      msg_len=2102) at totemsrp.c:3642
> #7  0x0804df12 in active_mcast_recv (instance=0x83b4700,  
> context=0x83c6000,
>      system_from=0x3fbfeb90, msg=0x83e2650, msg_len=2102) at  
> totemrrp.c:393
> #8  0x0804e2be in rrp_deliver_fn (context=0x83b5670,  
> system_from=0x3fbfeb90, msg=0x83e2650,
>      msg_len=2102) at totemrrp.c:549
> #9  0x0804c3b6 in net_deliver_fn (handle=0, fd=8, revents=1,  
> data=0x83e2000, prio=0x83c1454)
>      at totemnet.c:687
> #10 0x0804ab76 in poll_run (handle=0) at aispoll.c:424
> #11 0x0805f9cb in main (argc=1, argv=0x3fbfee88) at main.c:1317
> (gdb) frame 4
> #4  0x08051814 in memb_state_recovery_enter (instance=0x83c6000,  
> commit_token=0x83e2650)
>      at totemsrp.c:1626
> 1626    totemsrp.c: No such file or directory.
>          in totemsrp.c
> (gdb) print *instance
> $1 = {first_run = 1, fcc_remcast_last = 0, fcc_mcast_last = 0,  
> fcc_mcast_current = 0,
>    fcc_remcast_current = 0, consensus_list = {{addr = {nodeid =  
> 117506570, family = 2,
>          addr = "\n\002\001\a", '\0' <repeats 11 times>}, set = 1},  
> {addr = {nodeid = 84607498,
>          family = 2, addr = "\n\002\v\005\000\000n?\004\bL1;\b? 
> \026"}, set = 1}, {addr = {
>          nodeid = 4263051786, family = 2, addr = "\n\002\031?\000 
> \000n?\004\bL1;\b?\026"},
>        set = 1}, {addr = {nodeid = 4262724106, family = 2,
>          addr = "\n\002\024?\000\000n?\004\bL1;\b?\026"}, set = 1},  
> {addr = {nodeid = 100729354,
>          family = 2, addr = "\n\002\001\006??W\237\005\bL1;\b@ "},  
> set = 1}, {addr = {nodeid = 0,
>          family = 0, addr = '\0' <repeats 15 times>}, set = 0}  
> <repeats 27 times>},
>    consensus_list_entries = 4, my_proc_list = {{nodeid = 117506570,  
> family = 2,
>        addr = "\n\002\001\a", '\0' <repeats 11 times>}, {nodeid =  
> 84607498, family = 2,
>        addr = "\n\002\v\005??W\237\005\bL1;\b?4"}, {nodeid =  
> 4263051786, family = 2,
>        addr = "\n\002\031???W\237\005\bL1;\b@?"}, {nodeid =  
> 4262724106, family = 2,
>        addr = "\n\002\024?\000\000n?\004\bL1;\b?\026"}, {nodeid =  
> 100729354, family = 2,
>        addr = "\n\002\001\006", '\0' <repeats 11 times>}, {nodeid =  
> 0, family = 0,
>        addr = '\0' <repeats 15 times>} <repeats 27 times>},  
> my_failed_list = {{nodeid = 100729354,
>        family = 2, addr = "\n\002\001\006??W\237\005\bL1;\b?\024"},  
> {nodeid = 4262724106,
>        family = 2, addr = "\n\002\024?", '\0' <repeats 11 times>},  
> {nodeid = 4262724106,
>        family = 2, addr = "\n\002\024?", '\0' <repeats 11 times>},  
> {nodeid = 0, family = 0,
>        addr = '\0' <repeats 15 times>} <repeats 29 times>},  
> my_new_memb_list = {{
>        nodeid = 100729354, family = 2, addr = "\n\002\001\006", '\0'  
> <repeats 11 times>}, {
>        nodeid = 117506570, family = 2, addr = "\n\002\001\a", '\0'  
> <repeats 11 times>}, {
>        nodeid = 84607498, family = 2, addr = "\n\002\v\005??W\237\005 
> \bL1;\b?4"}, {
>        nodeid = 4262724106, family = 2, addr = "\n\002\024?", '\0'  
> <repeats 11 times>}, {
> ---Type <return> to continue, or q <return> to quit---
>        nodeid = 4263051786, family = 2, addr = "\n\002\031???W\237\005 
> \bL1;\b@?"}, {nodeid = 0,
>        family = 0, addr = '\0' <repeats 15 times>} <repeats 27  
> times>}, my_trans_memb_list = {{
>        nodeid = 117506570, family = 2, addr = "\n\002\001\a", '\0'  
> <repeats 11 times>}, {
>        nodeid = 84607498, family = 2, addr = "\n\002\v\005??W\237\005 
> \bL1;\b?4"}, {
>        nodeid = 4263051786, family = 2, addr = "\n\002\031???W\237\005 
> \bL1;\b@?"}, {
>        nodeid = 4263051786, family = 2, addr = "\n\002\031???W\237\005 
> \bL1;\b@?"}, {nodeid = 0,
>        family = 0, addr = '\0' <repeats 15 times>} <repeats 28  
> times>}, my_memb_list = {{
>        nodeid = 117506570, family = 2, addr = "\n\002\001\a", '\0'  
> <repeats 11 times>}, {
>        nodeid = 84607498, family = 2, addr = "\n\002\v\005??W\237\005 
> \bL1;\b?4"}, {
>        nodeid = 4263051786, family = 2, addr = "\n\002\031???W\237\005 
> \bL1;\b@?"}, {
>        nodeid = 4263051786, family = 2, addr = "\n\002\031???W\237\005 
> \bL1;\b@?"}, {nodeid = 0,
>        family = 0, addr = '\0' <repeats 15 times>} <repeats 28  
> times>}, my_deliver_memb_list = {{
>        nodeid = 117506570, family = 2, addr = "\n\002\001\a", '\0'  
> <repeats 11 times>}, {
>        nodeid = 84607498, family = 2, addr = "\n\002\v\005??W\237\005 
> \bL1;\b?4"}, {
>        nodeid = 4263051786, family = 2, addr = "\n\002\031???W\237\005 
> \bL1;\b@?"}, {
>        nodeid = 4263051786, family = 2, addr = "\n\002\031???W\237\005 
> \bL1;\b@?"}, {nodeid = 0,
>        family = 0, addr = '\0' <repeats 15 times>} <repeats 28  
> times>}, my_nodeid_lookup_list = {{
>        nodeid = 117506570, family = 2, addr = "\n\002\001\a", '\0'  
> <repeats 11 times>}, {
>        nodeid = 100729354, family = 2, addr = "\n\002\001\006", '\0'  
> <repeats 11 times>}, {
>        nodeid = 84607498, family = 2, addr = "\n\002\v\005", '\0'  
> <repeats 11 times>}, {
>        nodeid = 4262724106, family = 2, addr = "\n\002\024?", '\0'  
> <repeats 11 times>}, {
>        nodeid = 4263051786, family = 2, addr = "\n\002\031???W\237\005 
> \bL1;\b@?"}, {nodeid = 0,
>        family = 0, addr = '\0' <repeats 15 times>} <repeats 27  
> times>}, my_proc_list_entries = 5,
>    my_failed_list_entries = 0, my_new_memb_entries = 5,  
> my_trans_memb_entries = 3,
> ---Type <return> to continue, or q <return> to quit---
>    my_memb_entries = 3, my_deliver_memb_entries = 3,  
> my_nodeid_lookup_entries = 5, my_ring_id = {
>      rep = {nodeid = 100729354, family = 2, addr = "\n\002\001\006",  
> '\0' <repeats 11 times>},
>      seq = 68996}, my_old_ring_id = {rep = {nodeid = 117506570,  
> family = 2,
>        addr = "\n\002\001\a", '\0' <repeats 11 times>}, seq = 68992},  
> my_aru_count = 0,
>    my_merge_detect_timeout_outstanding = 0, my_last_aru = 0,  
> my_seq_unchanged = 0,
>    my_received_flg = 0, my_high_seq_received = 0, my_install_seq = 0,  
> my_rotation_counter = 0,
>    my_set_retrans_flg = 0, my_retrans_flg_count = 0,  
> my_high_ring_delivered = 0,
>    heartbeat_timeout = 764, new_message_queue = {head = 99, tail =  
> 104, used = 175, usedhw = 175,
>      size = 181, items = 0x83e7000, size_per_item = 48, iterator =  
> 0}, retrans_message_queue = {
>      head = 0, tail = 499, used = 0, usedhw = 0, size = 500, items =  
> 0x83ce000, size_per_item = 48,
>      iterator = 0}, regular_sort_queue = {head = 0, size = 256, items  
> = 0x83d4000,
>      items_inuse = 0x83c0c00, size_per_item = 44, head_seqid = 0,  
> item_count = 256, pos_max = 0},
>    recovery_sort_queue = {head = 0, size = 256, items = 0x83d7000,  
> items_inuse = 0x83c4000,
>      size_per_item = 44, head_seqid = 0, item_count = 256, pos_max =  
> 0}, my_aru = 0,
>    my_high_delivered = 0, token_callback_received_listhead = {next =  
> 0x83b3440, prev = 0x83b3440},
>    token_callback_sent_listhead = {next = 0x83c77f0, prev = 0x83c77f0},
>    orf_token_retransmit = 0x83ca000 "", orf_token_retransmit_size =  
> 82, my_token_seq = 4294967295,
>    timer_orf_token_timeout = 0x85815e0,  
> timer_orf_token_retransmit_timeout = 0x0,
>    timer_orf_token_hold_retransmit_timeout = 0x0,  
> timer_merge_detect_timeout = 0x0,
>    memb_timer_state_gather_join_timeout = 0x0,  
> memb_timer_state_gather_consensus_timeout = 0x0,
>    memb_timer_state_commit_timeout = 0x0, timer_heartbeat_timeout =  
> 0x85816c0,
>    totemsrp_log_level_security = 65538, totemsrp_log_level_error =  
> 131074,
>    totemsrp_log_level_warning = 196610, totemsrp_log_level_notice =  
> 262146,
>    totemsrp_log_level_debug = 327682, totemsrp_log_printf = 0x805fb7c  
> <internal_log_printf>,
> ---Type <return> to continue, or q <return> to quit---
>    memb_state = MEMB_STATE_COMMIT, my_id = {nodeid = 117506570,  
> family = 2,
>      addr = "\n\002\001\a", '\0' <repeats 11 times>}, next_memb =  
> {nodeid = 84607498, family = 2,
>      addr = "\n\002\v\005??W\237\005\bL1;\b?4"}, iov_buffer = '\0'  
> <repeats 8999 times>,
>    totemsrp_iov_recv = {iov_base = 0x0, iov_len = 0},  
> totemsrp_poll_handle = 0, totemsrp_recv = 0,
>    mcast_address = {nodeid = 0, family = 2, addr = "?^\001\002", '\0'  
> <repeats 11 times>},
>    totemsrp_deliver_fn = 0x8056c08 <totemmrp_deliver_fn>,
>    totemsrp_confchg_fn = 0x8056c3c <totemmrp_confchg_fn>,  
> global_seqno = 105, my_token_held = 0,
>    token_ring_id_seq = 68996, last_released = 0, set_aru =  
> 4294967295, old_ring_state_saved = 1,
>    old_ring_state_aru = 0, old_ring_state_high_seq_received = 0,  
> ring_saved = 1, my_last_seq = 30,
>    tv_old = {tv_sec = 0, tv_usec = 0}, totemrrp_handle = 0,  
> totem_config = 0x3fbfed14,
>    use_heartbeat = 1}
> (gdb) print commit_token
> $2 = (struct memb_commit_token *) 0x83e2650
> (gdb) print *commit_token
> $3 = {header = {type = 4 '\004', encapsulated = 0 '\0',  
> endian_detector = 65314,
>      nodeid = 4263051786}, token_seq = 7, ring_id = {rep = {nodeid =  
> 100729354, family = 2,
>        addr = "\n\002\001\006", '\0' <repeats 11 times>}, seq =  
> 68996}, retrans_flg = 0,
>    memb_index = 1, addr_entries = 5, addr = {{nodeid = 100729354,  
> family = 2,
>        addr = "\n\002\001\006", '\0' <repeats 11 times>}, {nodeid =  
> 117506570, family = 2,
>        addr = "\n\002\001\a", '\0' <repeats 11 times>}, {nodeid =  
> 84607498, family = 2,
>        addr = "\n\002\v\005??W\237\005\bL1;\b?4"}, {nodeid =  
> 4262724106, family = 2,
>        addr = "\n\002\024?", '\0' <repeats 11 times>}, {nodeid =  
> 4263051786, family = 2,
>        addr = "\n\002\031???W\237\005\bL1;\b@?"}, {nodeid = 0, family  
> = 0,
>        addr = '\0' <repeats 15 times>} <repeats 27 times>}, memb_list  
> = {{ring_id = {rep = {
>            nodeid = 100729354, family = 2, addr = "\n\002\001\006",  
> '\0' <repeats 11 times>},
>          seq = 68992}, aru = 30, high_delivered = 2, received_flg =  
> 0}, {ring_id = {rep = {
>            nodeid = 117506570, family = 2, addr = "\n\002\001\a",  
> '\0' <repeats 11 times>},
>          seq = 68992}, aru = 0, high_delivered = 0, received_flg =  
> 0}, {ring_id = {rep = {
>            nodeid = 117506570, family = 2, addr = "\n\002\001\a",  
> '\0' <repeats 11 times>},
>          seq = 68992}, aru = 15, high_delivered = 0, received_flg =  
> 0}, {ring_id = {rep = {
>            nodeid = 100729354, family = 2, addr = "\n\002\001\006",  
> '\0' <repeats 11 times>},
>          seq = 68992}, aru = 45, high_delivered = 15, received_flg =  
> 0}, {ring_id = {rep = {
>            nodeid = 117506570, family = 2, addr = "\n\002\001\a",  
> '\0' <repeats 11 times>},
>          seq = 68992}, aru = 30, high_delivered = 0, received_flg =  
> 0}, {ring_id = {rep = {
>            nodeid = 0, family = 0, addr = '\0' <repeats 15 times>},  
> seq = 0}, aru = 0,
>        high_delivered = 0, received_flg = 0} <repeats 27 times>}}
> (gdb)
> 
> Le 4 avr. 06 à 10:22, Steven Dake a écrit :
> 
> > Fabien,
> >
> > Please try this patch and see if it solves your problem.  It should,
> > but I am not able to duplicate the assert.  There were a few
> > programming errors in the protocol around handling of the recovery
> > state.
> >
> > Regards
> > -steve
> >
> > On Wed, 2006-03-29 at 09:51 +0200, Fabien THOMAS wrote:
> >> First launch with 4 nodes: one node crashed.
> >>
> >> I think it is not related to your patch because I've seen this crash
> >> before. :(
> >>
> >> (gdb) bt
> >> #0  0x28187723 in kill () from /lib/libc.so.6
> >> #1  0x280b61da in raise () from /usr/lib/libpthread.so.2
> >> #2  0x281863d4 in abort () from /lib/libc.so.6
> >> #3  0x28164358 in __assert () from /lib/libc.so.6
> >> #4  0x0805179b in memb_state_recovery_enter (instance=0x83c6000,
> >> commit_token=0x83e0650)
> >>      at totemsrp.c:1616
> >> #5  0x08056944 in message_handler_memb_commit_token
> >> (instance=0x83c6000, system_from=0x3fbfea00,
> >>      msg=0x83e0650, msg_len=2102, endian_conversion_needed=0) at
> >> totemsrp.c:3576
> >> #6  0x08056af1 in main_deliver_fn (context=0x83c6000,
> >> system_from=0x3fbfea00, msg=0x83e0650,
> >>      msg_len=2102) at totemsrp.c:3635
> >> #7  0x0804df5e in active_mcast_recv (instance=0x83b4680,
> >> context=0x83c6000,
> >>      system_from=0x3fbfea00, msg=0x83e0650, msg_len=2102) at
> >> totemrrp.c:393
> >> #8  0x0804e30a in rrp_deliver_fn (context=0x83b55f0,
> >> system_from=0x3fbfea00, msg=0x83e0650,
> >>      msg_len=2102) at totemrrp.c:549
> >> #9  0x0804c402 in net_deliver_fn (handle=0, fd=6, revents=1,
> >> data=0x83e0000, prio=0x83b4894)
> >>      at totemnet.c:687
> >> #10 0x0804abc2 in poll_run (handle=0) at aispoll.c:424
> >> #11 0x0805fa37 in main (argc=1, argv=0x3fbfecfc) at main.c:1313
> >> (gdb) frame 4
> >> #4  0x0805179b in memb_state_recovery_enter (instance=0x83c6000,
> >> commit_token=0x83e0650)
> >>      at totemsrp.c:1616
> >> 1616    totemsrp.c: No such file or directory.
> >>          in totemsrp.c
> >> (gdb) print range
> >> $1 = 4294967285
> >> (gdb) print *instance
> >> $2 = {first_run = 1, fcc_remcast_last = 0, fcc_mcast_last = 0,
> >> fcc_mcast_current = 0,
> >>    fcc_remcast_current = 0, consensus_list = {{addr = {nodeid =
> >> 117506570, family = 2,
> >>          addr = "\n\002\001\a\000\000\000\000\000\000\r;*D\030?"},
> >> set = 1}, {addr = {
> >>          nodeid = 4263051786, family = 2, addr = "\n\002\031?\000\000
> >> \000\000\000\000\r;*D\030?"},
> >>        set = 1}, {addr = {nodeid = 100729354, family = 2,
> >>          addr = "\n\002\001\006??W\237\005\bL1;\b 1"}, set = 1},
> >> {addr = {nodeid = 0, family = 0,
> >>          addr = '\0' <repeats 15 times>}, set = 0} <repeats 29
> >> times>}, consensus_list_entries = 2,
> >>    my_proc_list = {{nodeid = 117506570, family = 2,
> >>        addr = "\n\002\001\a", '\0' <repeats 11 times>}, {nodeid =
> >> 4263051786, family = 2,
> >>        addr = "\n\002\031?", '\0' <repeats 11 times>}, {nodeid =
> >> 100729354, family = 2,
> >>        addr = "\n\002\001\006", '\0' <repeats 11 times>}, {nodeid =
> >> 0, family = 0,
> >>        addr = '\0' <repeats 15 times>} <repeats 29 times>},
> >> my_failed_list = {{nodeid = 100729354,
> >>        family = 2, addr = "\n\002\001\006", '\0' <repeats 11 times>},
> >> {nodeid = 0, family = 0,
> >>        addr = '\0' <repeats 15 times>} <repeats 31 times>},
> >> my_new_memb_list = {{
> >>        nodeid = 100729354, family = 2, addr = "\n\002\001\006", '\0'
> >> <repeats 11 times>}, {
> >>        nodeid = 117506570, family = 2, addr = "\n\002\001\a", '\0'
> >> <repeats 11 times>}, {
> >>        nodeid = 4263051786, family = 2, addr = "\n\002\031?", '\0'
> >> <repeats 11 times>}, {
> >>        nodeid = 0, family = 0, addr = '\0' <repeats 15 times>}
> >> <repeats 29 times>},
> >>    my_trans_memb_list = {{nodeid = 117506570, family = 2,
> >>        addr = "\n\002\001\a", '\0' <repeats 11 times>}, {nodeid =
> >> 4263051786, family = 2,
> >>        addr = "\n\002\031?", '\0' <repeats 11 times>}, {nodeid = 0,
> >> family = 0,
> >>        addr = '\0' <repeats 15 times>} <repeats 30 times>},
> >> my_memb_list = {{nodeid = 117506570,
> >>        family = 2, addr = "\n\002\001\a", '\0' <repeats 11 times>},
> >> {nodeid = 4263051786,
> >>        family = 2, addr = "\n\002\031?", '\0' <repeats 11 times>},
> >> {nodeid = 0, family = 0,
> >> ---Type <return> to continue, or q <return> to quit---
> >>        addr = '\0' <repeats 15 times>} <repeats 30 times>},
> >> my_deliver_memb_list = {{
> >>        nodeid = 117506570, family = 2, addr = "\n\002\001\a", '\0'
> >> <repeats 11 times>}, {
> >>        nodeid = 4263051786, family = 2, addr = "\n\002\031?", '\0'
> >> <repeats 11 times>}, {
> >>        nodeid = 0, family = 0, addr = '\0' <repeats 15 times>}
> >> <repeats 30 times>},
> >>    my_nodeid_lookup_list = {{nodeid = 117506570, family = 2,
> >>        addr = "\n\002\001\a", '\0' <repeats 11 times>}, {nodeid =
> >> 4263051786, family = 2,
> >>        addr = "\n\002\031?", '\0' <repeats 11 times>}, {nodeid =
> >> 100729354, family = 2,
> >>        addr = "\n\002\001\006", '\0' <repeats 11 times>}, {nodeid =
> >> 0, family = 0,
> >>        addr = '\0' <repeats 15 times>} <repeats 29 times>},
> >> my_proc_list_entries = 3,
> >>    my_failed_list_entries = 0, my_new_memb_entries = 3,
> >> my_trans_memb_entries = 2,
> >>    my_memb_entries = 2, my_deliver_memb_entries = 2,
> >> my_nodeid_lookup_entries = 3, my_ring_id = {
> >>      rep = {nodeid = 100729354, family = 2, addr = "\n\002\001\006",
> >> '\0' <repeats 11 times>},
> >>      seq = 33764}, my_old_ring_id = {rep = {nodeid = 117506570,
> >> family = 2,
> >>        addr = "\n\002\001\a", '\0' <repeats 11 times>}, seq = 33756},
> >> my_aru_count = 0,
> >>    my_merge_detect_timeout_outstanding = 0, my_last_aru = 0,
> >> my_seq_unchanged = 0,
> >>    my_received_flg = 1, my_high_seq_received = 4, my_install_seq = 0,
> >> my_rotation_counter = 0,
> >>    my_set_retrans_flg = 0, my_retrans_flg_count = 0,
> >> my_high_ring_delivered = 0,
> >>    heartbeat_timeout = 0, new_message_queue = {head = 78, tail = 41,
> >> used = 36, usedhw = 36,
> >>      size = 181, items = 0x83e5000, size_per_item = 48, iterator =
> >> 0}, retrans_message_queue = {
> >>      head = 0, tail = 499, used = 0, usedhw = 0, size = 500, items =
> >> 0x83ce000, size_per_item = 48,
> >>      iterator = 0}, regular_sort_queue = {head = 0, size = 256, items
> >> = 0x83c3000,
> >>      items_inuse = 0x83c0c00, size_per_item = 44, head_seqid = 0,
> >> item_count = 256, pos_max = 4},
> >>    recovery_sort_queue = {head = 0, size = 256, items = 0x83d4000,
> >> items_inuse = 0x83d7000,
> >>      size_per_item = 44, head_seqid = 0, item_count = 256, pos_max =
> >> 0}, my_aru = 0,
> >> ---Type <return> to continue, or q <return> to quit---
> >>    my_high_delivered = 0, token_callback_received_listhead = {next =
> >> 0x83b3420, prev = 0x83b3420},
> >>    token_callback_sent_listhead = {next = 0x83c77f0, prev =  
> >> 0x83c77f0},
> >>    orf_token_retransmit = 0x83ca000 "", orf_token_retransmit_size =
> >> 82, my_token_seq = 4294967295,
> >>    timer_orf_token_timeout = 0x83b3480,
> >> timer_orf_token_retransmit_timeout = 0x83b34a0,
> >>    timer_orf_token_hold_retransmit_timeout = 0x0,
> >> timer_merge_detect_timeout = 0x0,
> >>    memb_timer_state_gather_join_timeout = 0x0,
> >> memb_timer_state_gather_consensus_timeout = 0x0,
> >>    memb_timer_state_commit_timeout = 0x0, timer_heartbeat_timeout  
> >> = 0x0,
> >>    totemsrp_log_level_security = 65538, totemsrp_log_level_error =
> >> 131074,
> >>    totemsrp_log_level_warning = 196610, totemsrp_log_level_notice =
> >> 262146,
> >>    totemsrp_log_level_debug = 327682, totemsrp_log_printf = 0x805fbe8
> >> <internal_log_printf>,
> >>    memb_state = MEMB_STATE_COMMIT, my_id = {nodeid = 117506570,
> >> family = 2,
> >>      addr = "\n\002\001\a", '\0' <repeats 11 times>}, next_memb =
> >> {nodeid = 4263051786, family = 2,
> >>      addr = "\n\002\031?", '\0' <repeats 11 times>}, iov_buffer =
> >> '\0' <repeats 8999 times>,
> >>    totemsrp_iov_recv = {iov_base = 0x0, iov_len = 0},
> >> totemsrp_poll_handle = 0, totemsrp_recv = 0,
> >>    mcast_address = {nodeid = 0, family = 2, addr = "?^\001\001", '\0'
> >> <repeats 11 times>},
> >>    totemsrp_deliver_fn = 0x8056c08 <totemmrp_deliver_fn>,
> >>    totemsrp_confchg_fn = 0x8056c3c <totemmrp_confchg_fn>,
> >> global_seqno = 223, my_token_held = 0,
> >>    token_ring_id_seq = 33764, last_released = 0, set_aru =
> >> 4294967295, old_ring_state_saved = 1,
> >>    old_ring_state_aru = 0, old_ring_state_high_seq_received = 4,
> >> ring_saved = 1, my_last_seq = 15,
> >>    tv_old = {tv_sec = 0, tv_usec = 0}, totemrrp_handle = 0,
> >> totem_config = 0x3fbfeb84,
> >>    use_heartbeat = 0}
> >>
> >>
> >> Mar 29  7:45:09 [NOTICE  ] [MAIN ] AIS Executive Service: Copyright
> >> (C) 2002-2006 MontaVista Software, Inc. and contributors.
> >> Mar 29  7:45:09 [WARNING ] [MAIN ] Could not lock memory of service
> >> to avoid page faults
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Token Timeout (1000 ms) retransmit
> >> timeout (238 ms)
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] token hold (180 ms) retransmits
> >> before loss (4 retrans)
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] join (100 ms) consensus (200 ms)
> >> merge (200 ms)
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] downcheck (1000 ms) fail to recv
> >> const (50 msgs)
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] seqno unchanged const (30
> >> rotations) Maximum network MTU 1500
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] send threads (0 threads)
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] heartbeat_failures_allowed (0)
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] max_network_delay (50 ms)
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] HeartBeat is Disabled. To enable
> >> set heartbeat_failures_allowed > 0
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Receive multicast socket recv
> >> buffer size (144000 bytes).
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Transmit multicast socket send
> >> buffer size (144000 bytes).
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] The network interface [10.2.1.7]
> >> is now up.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Created or loaded sequence id
> >> 33740.10.2.1.7 for this ring.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] entering GATHER state.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] openais component openais_cpg  
> >> loaded.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Registering service handler
> >> 'openais cluster closed process group service v1.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Initializing service handler
> >> 'openais cluster closed process group service v1.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] openais component openais_cfg  
> >> loaded.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Registering service handler
> >> 'openais configuration service'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Initializing service handler
> >> 'openais configuration service'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] openais component openais_msg  
> >> loaded.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Registering service handler
> >> 'openais message service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Initializing service handler
> >> 'openais message service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] openais component openais_lck  
> >> loaded.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Registering service handler
> >> 'openais distributed locking service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Initializing service handler
> >> 'openais distributed locking service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] openais component openais_evt  
> >> loaded.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Registering service handler
> >> 'openais event service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Initializing service handler
> >> 'openais event service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] openais component openais_ckpt
> >> loaded.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Registering service handler
> >> 'openais checkpoint service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Initializing service handler
> >> 'openais checkpoint service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] openais component openais_amf  
> >> loaded.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Registering service handler
> >> 'openais availability management framework B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Initializing service handler
> >> 'openais availability management framework B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] openais component openais_clm  
> >> loaded.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Registering service handler
> >> 'openais cluster membership service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Initializing service handler
> >> 'openais cluster membership service B.01.01'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] openais component openais_evs  
> >> loaded.
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Registering service handler
> >> 'openais extended virtual synchrony service'
> >> Mar 29  7:45:09 [NOTICE  ] [SERV ] Initializing service handler
> >> 'openais extended virtual synchrony service'
> >> Mar 29  7:45:09 [NOTICE  ] [MAIN ] AIS Executive Service: started and
> >> ready to receive connections.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Creating commit token because I am
> >> the rep.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Saving state aru 0 high seq
> >> received 0
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Storing new sequence id for ring
> >> 33744
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] entering COMMIT state.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] position [0] member 10.2.1.7:
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] previous ring seq 33740 rep  
> >> 10.2.1.7
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] aru 0 high delivered 0 received
> >> flag 1
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] copying all old ring messages from
> >> 1-0.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Originated 0 messages in RECOVERY.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Originated for recovery:
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Not Originated for recovery:
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Sending initial ORF token
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ] New Configuration:
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ] Members Left:
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ] Members Joined:
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ] New Configuration:
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ]      10.2.1.7
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ] Members Left:
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ] Members Joined:
> >> Mar 29  7:45:09 [NOTICE  ] [CLM  ]      10.2.1.7
> >> Mar 29  7:45:09 [NOTICE  ] [SYNC ] This node is within the non-
> >> primary component and will NOT provide any services.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] entering OPERATIONAL state.
> >> Mar 29  7:45:09 [NOTICE  ] [YKD  ] This processor is within the
> >> primary component.
> >> Mar 29  7:45:09 [NOTICE  ] [SYNC ] This node is within the primary
> >> component and will provide service.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] entering GATHER state.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Creating commit token because I am
> >> the rep.
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Saving state aru 25 high seq
> >> received 25
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] Storing new sequence id for ring
> >> 33748
> >> Mar 29  7:45:09 [NOTICE  ] [TOTEM] entering COMMIT state.
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] The token was lost in state 3 from
> >> timer 83c6000
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] entering GATHER state.
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] Creating commit token because I am
> >> the rep.
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] Storing new sequence id for ring
> >> 33752
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] entering COMMIT state.
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] position [0] member 10.2.1.7:
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] previous ring seq 33744 rep  
> >> 10.2.1.7
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] aru 25 high delivered 24 received
> >> flag 1
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] position [1] member 10.2.25.254:
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] previous ring seq 33748 rep
> >> 10.2.25.254
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] aru 2b high delivered 2b received
> >> flag 1
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] copying all old ring messages from
> >> 26-25.
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] Originated 0 messages in RECOVERY.
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] Originated for recovery:
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] Not Originated for recovery:
> >> Mar 29  7:45:10 [NOTICE  ] [TOTEM] Sending initial ORF token
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] New Configuration:
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ]      10.2.1.7
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] Members Left:
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] Members Joined:
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] New Configuration:
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ]      10.2.1.7
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ]      10.2.25.254
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] Members Left:
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] Members Joined:
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ]      10.2.25.254
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] This node is within the non-
> >> primary component and will NOT provide any services.
> >> Mar 29  7:45:11 [NOTICE  ] [TOTEM] entering OPERATIONAL state.
> >> Mar 29  7:45:11 [NOTICE  ] [YKD  ] This processor is within the
> >> primary component.
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] This node is within the primary
> >> component and will provide service.
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] Synchronization barrier completed
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] Synchronization actions starting
> >> for (openais cluster membership service B.01.01)
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] Synchronization actions done for
> >> (openais cluster membership service B.01.01)
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] got nodejoin message 10.2.1.7
> >> Mar 29  7:45:11 [NOTICE  ] [CLM  ] got nodejoin message 10.2.25.254
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] Synchronization barrier completed
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] Synchronization actions starting
> >> for (openais checkpoint service B.01.01)
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] Synchronization actions done for
> >> (openais checkpoint service B.01.01)
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] Synchronization barrier completed
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] Synchronization actions starting
> >> for (openais event service B.01.01)
> >> Mar 29  7:45:11 [NOTICE  ] [SYNC ] Synchronization actions done for
> >> (openais event service B.01.01)
> >> Mar 29  7:45:15 [NOTICE  ] [TOTEM] entering GATHER state.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] The token was lost in state 2 from
> >> timer 83c6000
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] entering GATHER state.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] entering GATHER state.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] Creating commit token because I am
> >> the rep.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] Saving state aru e5 high seq
> >> received e5
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] Storing new sequence id for ring
> >> 33756
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] entering COMMIT state.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] position [0] member 10.2.1.7:
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] previous ring seq 33752 rep  
> >> 10.2.1.7
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] aru e5 high delivered e5 received
> >> flag 1
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] position [1] member 10.2.25.254:
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] previous ring seq 33752 rep  
> >> 10.2.1.7
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] aru e5 high delivered e5 received
> >> flag 1
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] copying all old ring messages from
> >> e6-e5.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] Originated 0 messages in RECOVERY.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] Originated for recovery:
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] Not Originated for recovery:
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] Sending initial ORF token
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ] New Configuration:
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ]      10.2.1.7
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ]      10.2.25.254
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ] Members Left:
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ] Members Joined:
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ] New Configuration:
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ]      10.2.1.7
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ]      10.2.25.254
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ] Members Left:
> >> Mar 29  7:45:16 [NOTICE  ] [CLM  ] Members Joined:
> >> Mar 29  7:45:16 [NOTICE  ] [SYNC ] This node is within the non-
> >> primary component and will NOT provide any services.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] entering OPERATIONAL state.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] entering GATHER state.
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] Saving state aru 0 high seq
> >> received 4
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] Storing new sequence id for ring
> >> 33760
> >> Mar 29  7:45:16 [NOTICE  ] [TOTEM] entering COMMIT state.
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] The token was lost in state 3 from
> >> timer 83c6000
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] entering GATHER state.
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] Storing new sequence id for ring
> >> 33764
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] entering COMMIT state.
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] position [0] member 10.2.1.6:
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] previous ring seq 33748 rep  
> >> 10.2.1.6
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] aru 23 high delivered 0 received
> >> flag 1
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] position [1] member 10.2.1.7:
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] previous ring seq 33756 rep  
> >> 10.2.1.7
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] aru 0 high delivered 0 received
> >> flag 1
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] position [2] member 10.2.25.254:
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] previous ring seq 33756 rep  
> >> 10.2.1.7
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] aru f high delivered 0 received
> >> flag 1
> >> Mar 29  7:45:17 [NOTICE  ] [TOTEM] copying all old ring messages from
> >> 10-4.
> >> Assertion failed: (range < 1024), function memb_state_recovery_enter,
> >> file totemsrp.c, line 1616.
> >>
> >> Le 28 mars 06 à 22:15, Steven Dake a écrit :
> >>
> >>> Find attached a patch which I think will fix one of the problem(s).
> >>>
> >>> I think I see the problem here at least with this debug log output.
> >>>
> >>> A synchronization is taking place and then the other node starts
> >>> interrupting the synchronization.  But there are still
> >>> synchronization messages in flight.
> >>>
> >>> The sync service should ignore sync messages if the ring id under
> >>> which they were originated is not the same ring id delivered in the
> >>> last configuration change message.  Remember it is possible for those
> >>> recovery messages to sit queued.  This is yet another reason why we
> >>> need flushed totem.
> >>>
> >>> Regards
> >>> -steve
> >>>
> >>>
> >>> On Tue, 2006-03-28 at 11:29 -0700, Steven Dake wrote:
> >>>> Mark, this event error only comes up when the sync code is broken,
> >>>> right?
> >>>>
> >>>> Regards
> >>>> -steve
> >>>> email message attachment, "Forwarded message - [Bug 1153] Ramdom
> >>>> crash
> >>>> when 2nd instance of aisexec is launched"
> >>>> On Tue, 2006-03-28 at 11:29 -0700, Steven Dake wrote:
> >>>>> http://www.osdl.org/developer_bugzilla/show_bug.cgi?id=1153
> >>>>>
> >>>>>
> >>>>>
> >>>>>
> >>>>>
> >>>>> ------- Additional Comments From fabien.thomas at netasq.com
> >>>>> 2006-03-28 05:06 -------
> >>>>> I have more information; it seems that we have a race condition
> >>>>> somewhere:
> >>>>> I have two devices, one VIA Eden at 400 MHz and one VIA Eden at 800 MHz.
> >>>>> When the slow device is launched first, the fast device crashes very
> >>>>> often.
> >>>>> When the fast device is launched first, the slow device can
> >>>>> connect to the cluster without problems.
> >>>>>
> >>>>> maybe it can help to understand the problem...
> >>>>>
> >>>>> Here is another trace, smaller than the previous one:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [MAIN ] AIS Executive Service:
> >>>>> Copyright (C) 2002-2006 MontaVista
> >>>>> Software, Inc. and contributors.
> >>>>> Mar 28 13:04:52 [WARNING ] [MAIN ] Could not lock memory of
> >>>>> service to avoid page faults
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Token Timeout (1000 ms)
> >>>>> retransmit timeout (238 ms)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] token hold (180 ms)
> >>>>> retransmits before loss (4 retrans)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] join (100 ms) consensus (200
> >>>>> ms) merge (200 ms)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] downcheck (1000 ms) fail to
> >>>>> recv const (50 msgs)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] seqno unchanged const (30
> >>>>> rotations) Maximum network MTU
> >>>>> 1500
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] send threads (0 threads)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] heartbeat_failures_allowed (0)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] max_network_delay (50 ms)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] HeartBeat is Disabled. To
> >>>>> enable set heartbeat_failures_allowed >
> >>>>> 0
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Receive multicast socket recv
> >>>>> buffer size (144000 bytes).
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Transmit multicast socket send
> >>>>> buffer size (144000 bytes).
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] The network interface
> >>>>> [10.2.1.7] is now up.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Created or loaded sequence id
> >>>>> 5444.10.2.1.7 for this ring.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] entering GATHER state.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] openais component openais_cpg
> >>>>> loaded.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Registering service handler
> >>>>> 'openais cluster closed process group
> >>>>> service v1.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Initializing service handler
> >>>>> 'openais cluster closed process group
> >>>>> service v1.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] openais component openais_cfg
> >>>>> loaded.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Registering service handler
> >>>>> 'openais configuration service'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Initializing service handler
> >>>>> 'openais configuration service'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] openais component openais_msg
> >>>>> loaded.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Registering service handler
> >>>>> 'openais message service B.01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Initializing service handler
> >>>>> 'openais message service B.01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] openais component openais_lck
> >>>>> loaded.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Registering service handler
> >>>>> 'openais distributed locking service B.
> >>>>> 01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Initializing service handler
> >>>>> 'openais distributed locking service B.
> >>>>> 01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] openais component openais_evt
> >>>>> loaded.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Registering service handler
> >>>>> 'openais event service B.01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Initializing service handler
> >>>>> 'openais event service B.01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] openais component openais_ckpt
> >>>>> loaded.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Registering service handler
> >>>>> 'openais checkpoint service B.01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Initializing service handler
> >>>>> 'openais checkpoint service B.01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] openais component openais_amf
> >>>>> loaded.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Registering service handler
> >>>>> 'openais availability management
> >>>>> framework B.01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Initializing service handler
> >>>>> 'openais availability management
> >>>>> framework B.01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] openais component openais_clm
> >>>>> loaded.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Registering service handler
> >>>>> 'openais cluster membership service B.
> >>>>> 01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Initializing service handler
> >>>>> 'openais cluster membership service B.
> >>>>> 01.01'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] openais component openais_evs
> >>>>> loaded.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Registering service handler
> >>>>> 'openais extended virtual synchrony
> >>>>> service'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SERV ] Initializing service handler
> >>>>> 'openais extended virtual synchrony
> >>>>> service'
> >>>>> Mar 28 13:04:52 [NOTICE  ] [MAIN ] AIS Executive Service: started
> >>>>> and ready to receive connections.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Creating commit token because
> >>>>> I am the rep.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Saving state aru 0 high seq
> >>>>> received 0
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Storing new sequence id for
> >>>>> ring 5448
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] entering COMMIT state.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] position [0] member 10.2.1.7:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] previous ring seq 5444 rep
> >>>>> 10.2.1.7
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] aru 0 high delivered 0
> >>>>> received flag 1
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] copying all old ring messages
> >>>>> from 1-0.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Originated 0 messages in
> >>>>> RECOVERY.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Originated for recovery:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Not Originated for recovery:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Sending initial ORF token
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] New Configuration:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] Members Left:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] Members Joined:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] New Configuration:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ]      10.2.1.7
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] Members Left:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] Members Joined:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ]      10.2.1.7
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] This node is within the non-
> >>>>> primary component and will NOT
> >>>>> provide any services.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] entering OPERATIONAL state.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [YKD  ] This processor is within the
> >>>>> primary component.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] This node is within the
> >>>>> primary component and will provide service.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] Synchronization barrier  
> >>>>> completed
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] Synchronization actions
> >>>>> starting for (openais cluster membership
> >>>>> service B.01.01)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] Synchronization actions done
> >>>>> for (openais cluster membership
> >>>>> service B.01.01)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] got nodejoin message 10.2.1.7
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] Synchronization barrier  
> >>>>> completed
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] Synchronization actions
> >>>>> starting for (openais checkpoint service B.
> >>>>> 01.01)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] Synchronization actions done
> >>>>> for (openais checkpoint service B.
> >>>>> 01.01)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] entering GATHER state.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Saving state aru 28 high seq
> >>>>> received 28
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Storing new sequence id for
> >>>>> ring 5452
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] entering COMMIT state.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] position [0] member 10.2.1.6:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] previous ring seq 5448 rep
> >>>>> 10.2.1.6
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] aru 2b high delivered 2b
> >>>>> received flag 1
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] position [1] member 10.2.1.7:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] previous ring seq 5448 rep
> >>>>> 10.2.1.7
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] aru 28 high delivered 27
> >>>>> received flag 1
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] copying all old ring messages
> >>>>> from 29-28.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Originated 0 messages in
> >>>>> RECOVERY.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Originated for recovery:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] Not Originated for recovery:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] Synchronization barrier  
> >>>>> completed
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] Synchronization actions
> >>>>> starting for (openais event service B.01.01)
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] New Configuration:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ]      10.2.1.7
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] Members Left:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] Members Joined:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] CLM CONFIGURATION CHANGE
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] New Configuration:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ]      10.2.1.6
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ]      10.2.1.7
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] Members Left:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ] Members Joined:
> >>>>> Mar 28 13:04:52 [NOTICE  ] [CLM  ]      10.2.1.6
> >>>>> Mar 28 13:04:52 [NOTICE  ] [SYNC ] This node is within the non-
> >>>>> primary component and will NOT
> >>>>> provide any services.
> >>>>> Mar 28 13:04:52 [NOTICE  ] [TOTEM] entering OPERATIONAL state.
> >>>>> Mar 28 13:04:52 [ERROR   ] [EVT  ] recovery error node: (null)
> >>>>> not found
> >>>>> Assertion failed: (0), function evt_sync_process, file evt.c,
> >>>>> line 4056.
> >>>>>
> >>>>>
> >>>>>
> >>>>> ------- You are receiving this mail because: -------
> >>>>> You are on the CC list for the bug, or are watching someone who is.
> >>>> _______________________________________________
> >>>> Openais mailing list
> >>>> Openais at lists.osdl.org
> >>>> https://lists.osdl.org/mailman/listinfo/openais
> >>>> <defect-1153-1.patch>
> >>> _______________________________________________
> >>> Openais mailing list
> >>> Openais at lists.osdl.org
> >>> https://lists.osdl.org/mailman/listinfo/openais
> >>
> >> <defect-1170.patch>
> 




More information about the Openais mailing list