[Openais] CKPT: bug, global_ckpt_id not synced
Hans Feldt
Hans.Feldt at ericsson.com
Sun Sep 24 23:56:56 PDT 2006
Committed revision 1242.
Hans Feldt wrote:
> Sorry, the fix I did was not 100% correct. If only one checkpoint and
> one node exist and another node joins, it will execute 'if (0 > 0)', which
> will not increment global_ckpt_id on the joining node. All checkpoints
> created after this will get their IDs out of sync.
>
> The patch also includes a ckpt dump function, needed to troubleshoot these
> things.
>
> I must have tested with 2 checkpoints (as the test below describes).
>
> New patch attached.
>
> Regards,
> Hans
>
> Hans Feldt wrote:
>
>> Committed revision 1239.
>>
>> Hans Feldt wrote:
>>
>>> Test case:
>>> - start first node
>>> - create (with data) checkpoint 1 on first node
>>> - create (with data) checkpoint 2 on first node
>>> - start 2nd node
>>> - create (with data) checkpoint 3 on 2nd node
>>> - read checkpoint 3 on first node (fails without patch)
>>>
>>> There seem to be more errors related to the ckpt_id, which was
>>> introduced in r1139. Stay tuned or help us out.
>>>
>>> Regards,
>>> Hans
>>>
>>>
>>> ------------------------------------------------------------------------
>>>
>>> Index: ckpt.c
>>> ===================================================================
>>> --- ckpt.c (revision 1238)
>>> +++ ckpt.c (working copy)
>>> @@ -345,6 +345,7 @@
>>>
>>> DECLARE_LIST_INIT(checkpoint_recovery_list_head);
>>>
>>> +/* cluster wide synchronized checkpoint ID */
>>> static mar_uint32_t global_ckpt_id = 0;
>>>
>>> struct checkpoint_cleanup {
>>> @@ -2105,6 +2106,11 @@
>>> log_printf (LOG_LEVEL_DEBUG, "recovery CHECKPOINT reopened is
>>> %p\n", checkpoint);
>>> }
>>>
>>> + /* synchronize global_ckpt_id to max(ckpt_id,global_ckpt_id)+1 */
>>> + if (ckpt_id > global_ckpt_id) {
>>> + global_ckpt_id = ckpt_id + 1;
>>> + }
>>> +
>>> /*CHECK to see if there are any existing ckpts*/
>>> if ((checkpoint->ckpt_refcnt) &&
>>> (ckpt_refcnt_total(checkpoint->ckpt_refcnt) > 0)) {
>>> log_printf (LOG_LEVEL_DEBUG,"calling merge_ckpt_refcnts\n");
>>>
>>>
>>> ------------------------------------------------------------------------
>>>
>>> _______________________________________________
>>> Openais mailing list
>>> Openais at lists.osdl.org
>>> https://lists.osdl.org/mailman/listinfo/openais
>>
>>
>>
>> _______________________________________________
>> Openais mailing list
>> Openais at lists.osdl.org
>> https://lists.osdl.org/mailman/listinfo/openais
>>
>
>
> ------------------------------------------------------------------------
>
> Index: ckpt.c
> ===================================================================
> --- ckpt.c (revision 1240)
> +++ ckpt.c (working copy)
> @@ -337,6 +337,8 @@
> mar_uint32_t data_offset,
> mar_uint32_t data_size);
>
> +static void dump_fn(void);
> +
> static int process_localhost_transition = 0;
>
> DECLARE_LIST_INIT(checkpoint_list_head);
> @@ -551,7 +553,7 @@
> .lib_service = ckpt_lib_service,
> .lib_service_count = sizeof (ckpt_lib_service) / sizeof (struct openais_lib_handler),
> .exec_init_fn = ckpt_exec_init_fn,
> - .exec_dump_fn = 0,
> + .exec_dump_fn = dump_fn,
> .exec_service = ckpt_exec_service,
> .exec_service_count = sizeof (ckpt_exec_service) / sizeof (struct openais_exec_handler),
> .confchg_fn = ckpt_confchg_fn,
> @@ -2107,7 +2109,7 @@
> }
>
> /* synchronize global_ckpt_id to max(ckpt_id,global_ckpt_id)+1 */
> - if (ckpt_id > global_ckpt_id) {
> + if (ckpt_id >= global_ckpt_id) {
> global_ckpt_id = ckpt_id + 1;
> }
>
> @@ -4238,3 +4240,54 @@
> section_id_size);
> }
> }
> +
> +static void dump_fn (void)
> +{
> +#ifdef DEBUG
> + struct list_head *checkpoint_list;
> + struct checkpoint *checkpoint;
> + struct list_head *checkpoint_section_list;
> + struct checkpoint_section *section;
> +
> + log_printf (LOG_LEVEL_NOTICE,
> + "========== Checkpoint Information ===========");
> + log_printf (LOG_LEVEL_NOTICE, "global_ckpt_id: %u", global_ckpt_id);
> +
> + for (checkpoint_list = checkpoint_list_head.next;
> + checkpoint_list != &checkpoint_list_head;
> + checkpoint_list = checkpoint_list->next) {
> +
> + checkpoint = list_entry (checkpoint_list, struct checkpoint, list);
> +
> + if (checkpoint == NULL) {
> + return;
> + }
> +
> + log_printf (LOG_LEVEL_NOTICE, "Checkpoint %s (%d):",
> + checkpoint->name.value, checkpoint->name.length);
> + log_printf (LOG_LEVEL_NOTICE, " id: %u", checkpoint->ckpt_id);
> + log_printf (LOG_LEVEL_NOTICE, " sec cnt: %u", checkpoint->sectionCount);
> + log_printf (LOG_LEVEL_NOTICE, " ref cnt: %u", checkpoint->referenceCount);
> + log_printf (LOG_LEVEL_NOTICE, " unlinked: %u", checkpoint->unlinked);
> +
> + for (checkpoint_section_list = checkpoint->sections_list_head.next;
> + checkpoint_section_list != &checkpoint->sections_list_head;
> + checkpoint_section_list = checkpoint_section_list->next) {
> +
> + section = list_entry (checkpoint_section_list,
> + struct checkpoint_section, list);
> +
> + log_printf (LOG_LEVEL_NOTICE, " Section %s (%d)",
> + section->section_descriptor.section_id.id,
> + section->section_descriptor.section_id.id_len);
> + log_printf (LOG_LEVEL_NOTICE, " size: %llu",
> + section->section_descriptor.section_size);
> + log_printf (LOG_LEVEL_NOTICE, " state: %u",
> + section->section_descriptor.section_state);
> + log_printf (LOG_LEVEL_NOTICE, " exp time: %llu",
> + section->section_descriptor.expiration_time);
> + }
> + }
> +#endif
> +}
> +
>
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> Openais mailing list
> Openais at lists.osdl.org
> https://lists.osdl.org/mailman/listinfo/openais
More information about the Openais
mailing list