[Openais] CKPT: bug, global_ckpt_id not synced

Hans Feldt Hans.Feldt at ericsson.com
Sun Sep 24 23:56:56 PDT 2006


Committed revision 1242.

Hans Feldt wrote:
> Sorry, the fix I did was not 100% correct. If only one checkpoint and
> one node exist and another node joins, the joining node will execute
> 'if (0 > 0)', which will not increment global_ckpt_id on that node. All
> checkpoints created after this will get their IDs out of sync.
> 
> The patch also contains a ckpt dump function, needed to troubleshoot
> these things.
> 
> I must have tested with 2 checkpoints (as the test below describes).
> 
> New patch attached.
> 
> Regards,
> Hans
> 
> Hans Feldt wrote:
> 
>> Committed revision 1239.
>>
>> Hans Feldt wrote:
>>
>>> Test case:
>>> - start first node
>>> - create (with data) checkpoint 1 on first node
>>> - create (with data) checkpoint 2 on first node
>>> - start 2nd node
>>> - create (with data) checkpoint 3 on 2nd node
>>> - read checkpoint 3 on first node (fails without patch)
>>>
>>> There seem to be more errors related to the ckpt_id, which was
>>> introduced in r1139. Stay tuned or help us out.
>>>
>>> Regards,
>>> Hans
>>>
>>>
>>> ------------------------------------------------------------------------
>>>
>>> Index: ckpt.c
>>> ===================================================================
>>> --- ckpt.c    (revision 1238)
>>> +++ ckpt.c    (working copy)
>>> @@ -345,6 +345,7 @@
>>>
>>> DECLARE_LIST_INIT(checkpoint_recovery_list_head);
>>>
>>> +/* cluster wide synchronized checkpoint ID */
>>> static mar_uint32_t global_ckpt_id = 0;
>>>
>>> struct checkpoint_cleanup {
>>> @@ -2105,6 +2106,11 @@
>>>         log_printf (LOG_LEVEL_DEBUG, "recovery CHECKPOINT reopened is 
>>> %p\n", checkpoint);
>>>     }
>>>
>>> +    /* synchronize global_ckpt_id to max(ckpt_id,global_ckpt_id)+1 */
>>> +    if (ckpt_id > global_ckpt_id) {
>>> +        global_ckpt_id = ckpt_id + 1;
>>> +    }
>>> +
>>>     /*CHECK to see if there are any existing ckpts*/
>>>     if ((checkpoint->ckpt_refcnt) &&  
>>> (ckpt_refcnt_total(checkpoint->ckpt_refcnt) > 0)) {
>>>         log_printf (LOG_LEVEL_DEBUG,"calling merge_ckpt_refcnts\n");
>>>
>>>
>>> ------------------------------------------------------------------------
>>>
>>> _______________________________________________
>>> Openais mailing list
>>> Openais at lists.osdl.org
>>> https://lists.osdl.org/mailman/listinfo/openais
>>
>>
>>
>> _______________________________________________
>> Openais mailing list
>> Openais at lists.osdl.org
>> https://lists.osdl.org/mailman/listinfo/openais
>>
> 
> 
> ------------------------------------------------------------------------
> 
> Index: ckpt.c
> ===================================================================
> --- ckpt.c	(revision 1240)
> +++ ckpt.c	(working copy)
> @@ -337,6 +337,8 @@
>  	mar_uint32_t data_offset,
>  	mar_uint32_t data_size);
>  
> +static void dump_fn(void);
> +
>  static int process_localhost_transition = 0;
>  
>  DECLARE_LIST_INIT(checkpoint_list_head);
> @@ -551,7 +553,7 @@
>  	.lib_service			= ckpt_lib_service,
>  	.lib_service_count		= sizeof (ckpt_lib_service) / sizeof (struct openais_lib_handler),
>  	.exec_init_fn			= ckpt_exec_init_fn,
> -	.exec_dump_fn			= 0,
> +	.exec_dump_fn			= dump_fn,
>  	.exec_service			= ckpt_exec_service,
>  	.exec_service_count		= sizeof (ckpt_exec_service) / sizeof (struct openais_exec_handler),
>  	.confchg_fn			= ckpt_confchg_fn,
> @@ -2107,7 +2109,7 @@
>  	}
>  
>  	/* synchronize global_ckpt_id to max(ckpt_id,global_ckpt_id)+1 */
> -	if (ckpt_id > global_ckpt_id) {
> +	if (ckpt_id >= global_ckpt_id) {
>  		global_ckpt_id = ckpt_id + 1;
>  	}
>  
> @@ -4238,3 +4240,54 @@
>  				section_id_size);
>  	}
>  }
> +
> +static void dump_fn (void)
> +{
> +#ifdef DEBUG
> +	struct list_head *checkpoint_list;
> +	struct checkpoint *checkpoint;
> +	struct list_head *checkpoint_section_list;
> +	struct checkpoint_section *section;
> +
> +	log_printf (LOG_LEVEL_NOTICE,
> +		"========== Checkpoint Information ===========");
> +	log_printf (LOG_LEVEL_NOTICE, "global_ckpt_id: %u", global_ckpt_id);
> +
> +	for (checkpoint_list = checkpoint_list_head.next;
> +		checkpoint_list != &checkpoint_list_head;
> +		checkpoint_list = checkpoint_list->next) {
> +
> +		checkpoint = list_entry (checkpoint_list, struct checkpoint, list);
> +
> +		if (checkpoint == NULL) {
> +			return;
> +		}
> +
> +		log_printf (LOG_LEVEL_NOTICE, "Checkpoint %s (%d):",
> +			checkpoint->name.value, checkpoint->name.length);
> +		log_printf (LOG_LEVEL_NOTICE, "   id:       %u", checkpoint->ckpt_id);
> +		log_printf (LOG_LEVEL_NOTICE, "   sec cnt:  %u", checkpoint->sectionCount);
> +		log_printf (LOG_LEVEL_NOTICE, "   ref cnt:  %u", checkpoint->referenceCount);
> +		log_printf (LOG_LEVEL_NOTICE, "   unlinked: %u", checkpoint->unlinked);
> +
> +		for (checkpoint_section_list = checkpoint->sections_list_head.next;
> +			checkpoint_section_list != &checkpoint->sections_list_head;
> +			checkpoint_section_list = checkpoint_section_list->next) {
> +
> +			section = list_entry (checkpoint_section_list,
> +				struct checkpoint_section, list);
> +
> +			log_printf (LOG_LEVEL_NOTICE, "   Section %s (%d)",
> +				section->section_descriptor.section_id.id,
> +				section->section_descriptor.section_id.id_len);
> +			log_printf (LOG_LEVEL_NOTICE, "      size:     %llu",
> +				section->section_descriptor.section_size);
> +			log_printf (LOG_LEVEL_NOTICE, "      state:    %u",
> +				section->section_descriptor.section_state);
> +			log_printf (LOG_LEVEL_NOTICE, "      exp time: %llu",
> +				section->section_descriptor.expiration_time);
> +		}
> +	}
> +#endif
> +}
> +
> 
> 
> ------------------------------------------------------------------------
> 
> _______________________________________________
> Openais mailing list
> Openais at lists.osdl.org
> https://lists.osdl.org/mailman/listinfo/openais




More information about the Openais mailing list