[Openais] Patch README.amf with corresponding conf files

Hans Feldt Hans.Feldt at ericsson.com
Sun Sep 24 23:58:07 PDT 2006


Committed revision 1244.

Anders Eriksson wrote:
> This patch contains an update of README.amf to match current state of 
> the implementation. README.amf now also includes a detailed list of what 
> is currently NOT implemented.
> 
> README.amf includes now, as before, a "demo example". This example 
> requires a specific configuration to run the way it is described. 
> Because of that versions of amf.conf and openais.conf which match the 
> demo example are attached.
> 
> Regards,
> Anders Eriksson
> 
> 
> ------------------------------------------------------------------------
> 
> # AMF Example configuration file, please read README.amf
> # - Times in milliseconds
> # - clccli_path can be set on any level from application and down and will be
> # added to the CLI commands if they are not already specified with an absolute
> # path (begins with /).
> # WL - WorkLoad
> 
> safAmfCluster = TEST_CLUSTER {
> 	saAmfClusterStartupTimeout=3000
> 	safAmfNode = AMF1 {
> 		saAmfNodeSuFailOverProb=2000
> 		saAmfNodeSuFailoverMax=2
> 		saAmfNodeClmNode=seasc0035
> 	}
> #	safAmfNode = AMF2 {
> #		saAmfNodeSuFailOverProb=2000
> #		saAmfNodeSuFailoverMax=2
> #		saAmfNodeClmNode=p02
> #	}
> 	safApp = APP-1 {
> 		safSg = RAID {
> 			saAmfSGRedundancyModel=nplusm	
> 			saAmfSGNumPrefActiveSUs=1
> 			saAmfSGMaxActiveSIsperSUs=2
> 			saAmfSGNumPrefStandbySUs=1
> 			saAmfSGMaxStandbySIsperSUs=2
> 			saAmfSGCompRestartProb=100000
> 			saAmfSGCompRestartMax=2
> 			saAmfSGSuRestartProb=20000	
> 			saAmfSGSuRestartMax=3
> 			saAmfSGAutoAdjustProb=5000
> 			safSu = SERVICE_X_1 {
> 				saAmfSUHostedByNode=AMF1
> 				saAmfSUNumComponents=1
> 				safComp = A {
> 					saAmfCompCategory=sa_aware
> 					saAmfCompCapability=x_active_or_y_standby
> 					saAmfCompNumMaxActiveCsi=1
> 					saAmfCompNumMaxStandbyCsi=1
> 					saAmfCompDefaultClcCliTimeout = 500
> 					saAmfCompDefaultCallbackTimeOut = 500
> 					saAmfCompInstantiateCmd = /tmp/aisexample/clc_cli_script
> 					saAmfCompInstantiateCmdArgv= instantiate /tmp/aisexample/testamf1
> 					saAmfCompTerminateCmd = /tmp/aisexample/clc_cli_script
> 					saAmfCompTerminateCmdArgv = terminate
> 					saAmfCompCleanupCmd = /tmp/aisexample/clc_cli_script
> 					saAmfCompCleanupCmdArgv = cleanup
> 					saAmfCompCsTypes {
> 						A
> 					}
> 					saAmfCompCmdEnv {
> 						var1=val1
> 						var2=val2
> 					}
> 					saAmfCompRecoveryOnError=component_restart
> 					safHealthcheckKey = key1 {
> 						saAmfHealthcheckPeriod = 5000
> 						saAmfHealthcheckMaxDuration = 350
> 					}
> 				}
> 				safComp = B {
> 					saAmfCompCategory=sa_aware
> 					saAmfCompCapability=x_active_or_y_standby
> 					saAmfCompNumMaxActiveCsi=1
> 					saAmfCompNumMaxStandbyCsi=1
> 					saAmfCompDefaultClcCliTimeout = 500
> 					saAmfCompDefaultCallbackTimeOut = 500
> 					saAmfCompInstantiateCmd = /tmp/aisexample/clc_cli_script
> 					saAmfCompInstantiateCmdArgv= instantiate /tmp/aisexample/testamf1
> 					saAmfCompTerminateCmd = /tmp/aisexample/clc_cli_script
> 					saAmfCompTerminateCmdArgv = terminate
> 					saAmfCompCleanupCmd = /tmp/aisexample/clc_cli_script
> 					saAmfCompCleanupCmdArgv = cleanup
> 					saAmfCompCsTypes {
> 						B
> 					}
> 					saAmfCompCmdEnv {
> 						var1=val1
> 						var2=val2
> 					}
> 					saAmfCompRecoveryOnError=component_restart
> 					safHealthcheckKey = key1 {
> 						saAmfHealthcheckPeriod = 1000
> 						saAmfHealthcheckMaxDuration = 350
> 					}
> 				}
> 			}
> 			safSu = SERVICE_X_2 {
> 				clccli_path=/tmp/aisexample
> 				saAmfSUHostedByNode=AMF1
> #				saAmfSUHostedByNode=AMF2
> 				saAmfSUNumComponents=1
> 				safComp = A {
> 					saAmfCompCategory=sa_aware
> 					saAmfCompCapability=x_active_or_y_standby
> 					saAmfCompNumMaxActiveCsi=1
> 					saAmfCompNumMaxStandbyCsi=1
> 					saAmfCompDefaultClcCliTimeout = 500
> 					saAmfCompDefaultCallbackTimeOut = 500
> 					saAmfCompInstantiateCmd = clc_cli_script
> 					saAmfCompInstantiateCmdArgv= instantiate
> 					saAmfCompTerminateCmd = clc_cli_script
> 					saAmfCompTerminateCmdArgv = terminate
> 					saAmfCompCleanupCmd = clc_cli_script
> 					saAmfCompCleanupCmdArgv = cleanup
> 					saAmfCompCsTypes {
> 						A
> 					}
> 					saAmfCompCmdEnv {
> 						COMP_BINARY_PATH=/tmp/aisexample
> 						COMP_BINARY_NAME=testamf1
> 						var1=val1
> 						var2=val2
> 					}
> 					saAmfCompRecoveryOnError=component_restart
> 					safHealthcheckKey = key1 {
> 						saAmfHealthcheckPeriod = 5000
> 						saAmfHealthcheckMaxDuration = 350
> 					}
> 					safHealthcheckKey = key2 {
> 						saAmfHealthcheckPeriod = 3000
> 						saAmfHealthcheckMaxDuration = 350
> 					}
> 				}
> 				safComp = B {
> 					saAmfCompCategory=sa_aware
> 					saAmfCompCapability=x_active_or_y_standby
> 					saAmfCompNumMaxActiveCsi=1
> 					saAmfCompNumMaxStandbyCsi=1
> 					saAmfCompDefaultClcCliTimeout = 500
> 					saAmfCompDefaultCallbackTimeOut = 500
> 					saAmfCompInstantiateCmd = /tmp/aisexample/clc_cli_script
> 					saAmfCompInstantiateCmdArgv= instantiate /tmp/aisexample/testamf1
> 					saAmfCompTerminateCmd = /tmp/aisexample/clc_cli_script
> 					saAmfCompTerminateCmdArgv = terminate
> 					saAmfCompCleanupCmd = /tmp/aisexample/clc_cli_script
> 					saAmfCompCleanupCmdArgv = cleanup
> 					saAmfCompCsTypes {
> 						B
> 					}
> 					saAmfCompCmdEnv {
> 						var1=val1
> 						var2=val2
> 					}
> 					saAmfCompRecoveryOnError=component_restart
> 					safHealthcheckKey = key1 {
> 						saAmfHealthcheckPeriod = 5000
> 						saAmfHealthcheckMaxDuration = 350
> 					}
> 				}
> 			}
> 		}
> 		safSi = WL1 {
> 			saAmfSINumCSIs=2
> 			safCsi = WL1-1 {
> 				saAmfCSTypeName = A
> 			}
> 			safCsi = WL1-2 {
> 				saAmfCSTypeName = B
> 				safCSIAttr = attr1 {
> 					val1
> 					val2
> 				}
> 				safCSIAttr = good_health_limit {
> 					10
> 				}
> 			}
> 		}
> 		safSi = WL2 {
> 			saAmfSINumCSIs=2
> 			safCsi = WL2-1 {
> 				saAmfCSTypeName = A
> 			}
> 			safCsi = WL2-2 {
> 				saAmfCSTypeName = B
> 				safCSIAttr = attr1 {
> 					val1
> 					val2
> 				}
> 				safCSIAttr = good_health_limit {
> 					10
> 				}
> 			}
> 		}
> 		safCSType = A {
> 			safAmfCSAttrName = attr1
> 			safAmfCSAttrName = good_health_limit
> 		}
> 		safCSType = B {
> 		}
> 	}
> }
> 
> 
> 
> ------------------------------------------------------------------------
> 
> # Please read the openais.conf.5 manual page
> 
> totem {
> 	version: 2
> 	secauth: off
> 	threads: 0
> 	interface {
> 		ringnumber: 0
> 		bindnetaddr: 127.0.0.0
> 		mcastaddr: 226.94.1.1
> 		mcastport: 5405
> 	}
> }
> 
> logging {
> 	fileline: off
> 	to_stderr: yes
> 	to_file: yes
> 	logfile: /tmp/openais.log
> 	debug: off
> 	timestamp: on
> 	logger {
> 		ident: AMF
> 		debug: off
> 		tags: enter|leave|trace1|trace2|trace3|trace4|trace6
> 	}
> }
> 
> amf {
> 	mode: enabled
> }
> 
> aisexec {
>     user: nisse
>     group: cello
> }
> 
> 
> ------------------------------------------------------------------------
> 
> Index: README.amf
> ===================================================================
> --- README.amf	(revision 1241)
> +++ README.amf	(working copy)
> @@ -1,228 +1,537 @@
> -AMF B.01.01 Implementation
> +AMF B.02.01 Implementation
>  --------------------------
> -This patch contains the basis of the AMF B.01.01 service targeted for release
> -in Wilson (1.0).  It is a work in progress and incomplete at this time.
> +The implementation of AMF in openais is directed by the specification 
> +SAI-AIS-AMF-B.02.01, see http://www.saforum.org/specification/.
>  
>  What does AMF do?
>  -----------------
>  The AMF has many major duties:
>   * issue instantiate, terminate, and cleanup operations for components
>   * assignment of component service instances to components
> - * detection of component faults and executing recovery actions
> + * executing of recovery and repair actions on fault reports delivered
> +   by components (fault detection is a responsibility of all entities
> +   in the system)
>  
> -The AMF starts and stops processes that are part of the component.  A SU
> -contains multiple components.  A service group contains multiple SUs.
> -A SU is the unit of redundancy used to implement high availability.
> +An AMF user has to provide instantiate and cleanup commands and a
> +configuration file, besides the binaries that represent the actual
> +components.
>  
> -The process of starting and stopping components takes place using the CLC
> -operations.  The AMF specification is exceedingly clear about which CLC
> -operations occur for which component types and openais implements the full
> -CLC operations for all of the various component types.
> +To start a component, AMF executes the instantiate command which starts
> +processes that are part of the component. AMF can stop the component
> +abruptly by running the cleanup command.
>  
> +A service unit (SU) contains multiple components and represents a 
> +"useable service" and is configured to execute on an AMF node. The AMF node
> +is mapped in the configuration to a CLM node which is "an operating system
> +instance". An SU is the smallest part that can be instantiated in a redundant
> +manner and can therefore be viewed as the unit of redundancy.
> +
> +A service group (SG) contains multiple SUs. The SG is the unit that implements
> +high availability by managing its contained service units. An SG can be
> +configured to execute different redundancy policies. 
> +
> +An application contains multiple SGs and multiple service instances (SIs).
> + 
> +An SI represents the workload for an SU. An SI consists of one or more
> +component service instances (CSIs). 
> +
> +A CSI represents the workload of a component. The CSI is configured to include
> +a list of name value pairs through which the user can express the workload.
> +
> +The AMF specification defines several types of components.  The AMF
> +specification is exceedingly clear about which CLC operations occur for which
> +component types.
> +
>  If a component is not sa-aware, the only level of high availability that
>  can be applied to the application is through execution of the CLC interfaces.
>  
>  A special component, called a proxy component, can be used to present an
> -sa-aware component to AMF to manage a non-sa-aware component.  This would be 
> +SA-aware component to AMF to manage a non-SA-aware component.  This would be 
>  useful, for example, to implement a healthcheck operation which runs some
>  operation of the unmodified application service.
>  
> -Components that are sa-aware have been written specifically to the AMF
> +Components that are SA-aware have been written specifically to the AMF
>  interfaces.  These components provide the most support for high availability
>  for application developers.
>  
> -When an sa-aware component is registered, service instances are assigned
> -to the component once the service unit is available to take service.  This
> -service instance specifies whether the component is ACTIVE or STANDBY.  The
> -component is directed by the AMF to enter either ACTIVE or STANDBY states
> -and then executes its assigned operational mode.  The number of CSIs assigned
> -to a component is determined by a reduction process with 6 levels of
> -reduction.  The AMF provides a very clear definition of what is required
> -with several examples for each reduction level.
> +When an SA-aware component has been instantiated it has to register within a
> +certain time. After a successful registration, AMF assigns workload to the
> +component by making callbacks once the service unit is available to take service.
> +There will be one callback for each CSI-assignment. Each CSI-assignment has
> +a HA state associated which indicates how the component shall act.
> +The HA state can be ACTIVE, STANDBY, QUIESCED or QUIESCING.
>  
> -The AMF detects faults through the use of a healthcheck operation.  The user
> -specifies in a configuration file healthcheck keys and timing parameters.
> +The number of CSIs assigned to a component and the setting of their HA state
> +is determined by AMF. In the configuration the operator specifies the preferred
> +assignment of workload to the defined SUs. The configuration specifies also
> +limits for how much work each SU can execute. If the preferred distribution of
> +workload cannot be met due to problems in the cluster, a reduction process with
> +6 levels of reduction will be executed by AMF. The purpose of the reduction
> +procedure is to come as close as possible to the preferred configuration without
> +violating any limits for how much workload an SU can handle. The reduction
> +procedure continues until there are no SUs in-service in the SG.
> +
> +AMF supports fault detection through a healthcheck API.  The user
> +specifies in the configuration file healthcheck keys and timing parameters.
>  This configuration is then used by the application developer to register
>  a healthcheck operation in the AMF.  The healthcheck operation can be started
>  or stopped.  Once started, the AMF will periodically send a request to the
> -component to determine its level of health.  The AMF reacts to negative
> -healthchecks or failed healthchecks by executing a recovery policy.
> +component to determine its level of health. Optionally, AMF can be configured to
> +instead expect the component to report its health periodically. 
> +The AMF reacts to negative healthchecks or failed healthchecks by executing 
> +a recovery policy.
>  
> -The recovery policy attempts to restart components first.  When components
> -are restarted and fail a certain number of times within a timeout period, the
> -entire service unit is failed over.  When SUs on one node are restarted and fail
> -a certain number of times within a timeout period, the service unit is failed
> -over to a standby service unit.
> +The AMF specification also includes an API for reporting errors with a 
> +recommended recovery action. AMF will not take a weaker recovery action than
> +what is recommended but may take a stronger action based on the recovery
> +escalation policy.
>  
> -Currently openais implements most of what is described above.
> +There is a recovery escalation policy for the recommendations:
> +- component restart
> +- component failover
>  
> -How to configure AMF
> ---------------------
> -The AMF doesn't specify a configuration file format.  It does specify many
> -configuration options, which are mostly implemented in openais.  The
> -configuration file specifies the service groups, service units, service
> -instances, recovery configuration options, and information describing where
> -components and CLI (command line interface) tools are located.
> +When AMF receives a recommendation to restart a component, the recovery policy
> +attempts to restart the component first.  When the component is restarted and
> +fails a certain number of times within a timeout period, the entire service unit
> +is restarted. When the SU has been restarted a certain number of times within
> +a certain timeout period, the SU is failed over to a standby SU. If AMF fails
> +over too many service units out of the same node in a given time period as a
> +consequence of error reports with either component restart or component
> +failover recommended recovery actions, the AMF escalates the recovery to an 
> +entire node fail-over.
>  
> -There are several configuration options which are used to control the component
> -life cycle (CLC) of the component.  These configuration options are:
> +What is currently implemented ?
> +-------------------------------
>  
> -in the group section:
> -clccli_path=/home/sdake/amfb-dec/test
> -  The path to the CLC CLI applications.
> +SA-aware components can be instantiated and assigned load according to the
> +configuration specified in amf.conf. Other types of components are currently
> +not supported. The processes of instantiation and assignment of workload are 
> +both simplified compared to the requirements in the AMF specification.
>  
> -binary_path=/home/sdake/amfb-dec/test
> -  The path to the components.
> +Service units represented by their components can be configured to execute
> +on different nodes. AMF supports initial start of the cluster as well as adding
> +of a node to the cluster after the initial start. AMF also supports a node
> +leaving the cluster by failing over the workload to standby service units.
>  
> -in the unit section:
> -bn=testamf1
> -  The bn parameter specifies the binary name of the application that should be
> -  run by the instantion script.  Note instantiate may already know this
> -  information and hence, this is optional.
> +Healthchecks are implemented as specified with only a few details missing.
>  
> -instantiate=clc_cli_script
> -  The instantiate parameter specifies the CLC-CLI binary program to be run to
> -  instantiate a component.  An instantiation starts the processes representing
> -  the component.
> +The error report API is implemented, but AMF ignores the recommended
> +recovery action; instead it will always try to recover by 'component restart'.
> + 
> +The error escalation mechanism up to SU failover is also implemented as
> +specified with a few simplifications.
>  
> -terminate=clc_cli_script
> -  The terminate parameter specifies the CLC-CLI binary program to be run to
> -  terminate a component.  A terminate CLC terminates the processes representing
> -  the component nicely by properly shutting down.
> +Only redundancy model N+M is (partly) implemented.
>  
> -cleanup=clc_cli_script
> -  The cleanup parameter specifies the CLC-CLI binary program to be run to
> -  cleanup a component.  A cleanup CLC terminates the processes representing
> -  the component abruptly.
> +You can find a detailed list of what is NOT implemented later in the README.
>  
> -There are several options to describe the component recovery escalation
> -policies.  These are:
> +How to configure AMF
> +--------------------
> +The AMF specification doesn't specify a configuration file format. It does
> +however, describe many configuration options, which are specified formally in 
> +SAI-Overview-B.02.01 chapter 4.5 - 4.11. The Overview can also be retrieved
> +from http://www.saforum.org/specification/.
>  
> -component_restart_probation=100000
> -  This specifies the number of milliseconds that a component can be restarted
> -  in escalation level 0 (only restart components) before escalating to level 1.
> +An implementation specific feature of openais is to implement the configuration
> +options in a file called amf.conf. There is a man page in the /man directory
> +which describes the syntax of amf.conf and which configuration options
> +are currently supported.
>  
> -component_restart_max=4
> -  This specifies the number of times within component_restart_probation period
> -  before escalating from level 0 to level 1.
> +The example programs
> +--------------------
> +First the openais example programs should be installed.  When compiling openais
> +in the exec directory a file called openais-instantiate is created.  Copy this
> +file to a test directory of your own:
>  
> -unit_restart_probation=200000
> -  This specifies the number of milliseconds that a unit can be restarted
> -  in escalation level 1 (restart entire SU) before escalating to level 2.
> +mkdir /tmp/aisexample
>  
> -unit_restart_max=6
> -  This specifies the number of times within unit_restart_probation period
> -  before escalating from level 1 to level 2.
> +exec# cp openais-instantiate /tmp/aisexample
>  
> -The AMF will execute a N+M reduction process based upon the number of service
> -instances specified in the configuration file and 4 configuration options
> -at the groups level:
> +Copy also the script which implements the instantiate, terminate and clean-up
> +operations to your test directory:
>  
> -preferred-active-units=3
> -  This is the preferred number of active units that should be active.
> +exec# cp ../test/clc_cli_script /tmp/aisexample/clc_cli_script
>  
> -maximum-active-instances=3
> -  This is the naximum number of active CSIs that can be assigned to a component.
> +Set execute permissions for the clc_cli_script
>  
> -preferred-standby-units=2
> -  This is the preferred number of standby units that should be active.
> +exec# chmod +x /tmp/aisexample/clc_cli_script
>  
> -maximum-standby-instances=4
> -  This is the naximum number of standby CSIs that can be assigned to a component.
> +Copy the binary to be used for all components:
> +exec# cp ../test/testamf1 /tmp/aisexample/testamf1
>  
> -A service instance is specified only as a name.  If there are 4 SIs, the
> -reduction process will execute as per the AMF specification to assign the proper
> -number of active and standby CSIs to components currently registered.  This
> -is a little buggy at the moment.
> +Copy the amf example configuration files from the openais/conf directory to
> +your test directory.
>  
> -serviceinstance {
> -	name = siaa
> -}
> +exec# cp ../conf/*amf_example.conf /tmp/aisexample
>  
> -Failure detection occurs through the healthcheck option.  The healthcheck
> -options are
> -key
> -  The name of the healthcheck parameter
> +Set the environment variables to the names of the configuration files:
>  
> -period
> -  The number of milliseconds to wait before issueing a new healthcheck.
> +setenv OPENAIS_AMF_CONFIG_FILE /tmp/aisexample/amf_example.conf
> +setenv OPENAIS_MAIN_CONFIG_FILE /tmp/aisexample/openais_amf_example.conf
>  
> -maximum_duration
> -  The maximum amount of time to wait for a healthcheck to complete before
> -  declaring a failure.
> +You have to specify the host on which you would like to execute the AMF example.
> +Open the file 'amf_example.conf' and replace the line:
>  
> +saAmfNodeClmNode=p01
>  
> -The example programs
> ---------------------
> -First the openais test programs should be installed.  When compiling openais
> -in the exec directory a file called openais-instantiate is created.  Copy this
> -to the test directory
> +in the following section in the cluster configuration:
>  
> -exec# cp openais-instantiate ../test
> +	safAmfNode = AMF1 {
> +		saAmfNodeSuFailOverProb=2000
> +		saAmfNodeSuFailoverMax=2
> +		saAmfNodeClmNode=p01
> +	}
>  
> -Set execute permissions for the clc_cli_script
> +p01 shall be replaced with the name of your host.
>  
> -exec# cd ../test
> -test# chmod +x ../clc_cli_script
> +(You can obtain the name of your host by typing the command 'hostname' in a 
> +shell.)
>  
> -IMPORTANT NOTE:
> -Within the amf stanza, the mode variable should be set to enabled.  This option
> -defaults to off and the default configuration file turns this off as well.
> -This is configured off by default to keep from confusing openais users
> -interested in using AIS without the alpha-AMF.
> +Modify the following rows of 'openais_amf_example.conf' so that they match your
> +user and group:
>  
> -example openais.conf:
> -amf {
> -	mode: enabled
> +aisexec {
> +    user: eraanee
> +    group: cello
>  }
>  
> -The following two paths must be set in the groups.conf file:
> -       clccli_path=/home/sdake/amfb-l/test
> -       binary_path=/home/sdake/amfb-l/test
> +(One way to obtain your user and group is to type the command 'id' in a shell.)
>  
> -If these are not set, the path to the clc_cli_script and component binaries
> -cannot be determined and AMF will not institate the testamf1 binary.
> +Start aisexec by command:
> +./aisexec
>  
> -Once aisexec is run using the default configuration file, 5 service units
> -will be instantiated.  The testamf1 C code will be used for all 5 SUs
> -and both comp_a and comp_b.  The testamf1 program determines its component
> -name at start time from the saAmfComponentNameGet api call.  The result is
> -that 10 processes will be started by AMF.
> +aisexec will be run in the background.
> +Once aisexec is run using the example configuration file, 2 service units
> +will be instantiated.  The testamf1 C code will be used for both component A
> +and component B of both SUs.  The testamf1 program determines its
> +component name at start time from the saAmfComponentNameGet() api call.
> +The result is that 4 processes will be started by AMF.
>  
> -The testamf1 will be assigned CSIs after they execute a saAmfComponentRegister
> -operation.  Note this operation causes the presence state of the testamf1
> -component to be set to INSTANTIATED as required by the AMF specification.  The
> -service instances and their names are defined within the configuration file.
> +Each testamf1 process will first try to register a bad component name and
> +thereafter register the name returned from saAmfComponentNameGet().
> +The testamf1 will be assigned CSIs after they execute a 
> +saAmfComponentRegister() API call.  Note that a successful registration causes
> +the state of the component and service units to be set to INSTANTIATED as
> +required by the AMF specification.  The service instances and their names are
> +defined within the configuration file.
>  
> -The testamf1 program reports an error via saAmfErrorReport after 10
> -healthchecks.  This results in openais calling the cleanup handler, which for
> +The component of type saAmfCSTypeName = B, which has the active HA state,
> +in this case, safComp=B,safSu=SERVICE_X_1,safSg=RAID,safApp=APP-1,
> +reports an error via saAmfErrorReport() after exactly 10 healthchecks.
> +The healthcheck period is configured to 1 second so one error report is sent
> +every 10th second.
> +This results in openais calling the cleanup handler, which for
>  an sa-aware component, is the CLC_CLI_CLEANUP command.  This causes the cleanup
>  operation of the clc_cli_script to be run.  This cleanup command then reads the
> -pid of the process that was stored to /var/run at startup of the testamf1
> -program.  It then executes a kill -9 on the PID.  Custom cleanup operations can
> -be executed by modifying the clc_cli_script script program.
> +pid of the process that was stored to /var/run (or /tmp) at startup of the
> +testamf1 program.  It then executes a kill -9 on the PID.  Custom cleanup
> +operations can be executed by modifying the clc_cli_script script program.
>  
> -After this is done 4 times (configurable) the entire service
> -unit is terminated and restarted. Once this happens 6 times, the code
> -escalates to level 2, which is currently unimplemented.
> +After this is done 2 times (configurable) the entire service
> +unit is terminated and restarted due to the error escalation mechanism. Once
> +this happens 3 times (also configurable), the code escalates to level 2 and a
> +failover of the SU takes place. After this testamf1 makes no more error
> +reports and nothing will happen until some problem is recognized (like the
> +process of one of the components stops executing).
>  
> -Currently working:
> -component register, healthcheck start and stop, csi assignment, n+m with
> -all 6 reduction levels, error report, amf response, terminate, cleanup and
> -restart escalation levels 0-1, single node (multinode not tested),
> -setting presence and operational state of components internally, initial
> -assignment of n+m csis based upon configuration options and fully
> -following AIS AMF B spec.
> +The states of the cluster and its contained entities can be obtained by issuing
> +the following command in the shell:
>  
> -Not working or tested:
> -escalation levels 2-3 (switchover/failover), protection group tracking,
> -protection groups in general, any other model besides n+m, amf B
> -specified reassignment of csis to terminated and restarted components,
> -support for proxied or non-sa aware components, state machine for n+m
> -needs alot of work after initial start.  Timeout periods to reduce
> -escalation level for escalation policies are unimplemented.
> +pkill -USR2 ais
>  
> -Any feedback appreciated.
> +Some notes:
> +-----------
> +In the example, testamf1 is sending an error report at the 10th healthcheck.
> +This is actually controlled by the safCSIAttr = good_health_limit in 
> +file amf_example.conf and can be changed as you like.
>  
> -Keep in mind this is very early code and may have many bugs which I'd
> -be happy to have reported :).
> +The file openais_amf_example.conf specifies logging to stderr.
> +
> +If you would like to follow more closely the execution of the AMF in openais,
> +debug printouts can be enabled.
> +
> +example:
> +logging {
> +	fileline: off
> +	to_stderr: yes
> +	to_file: no
> +	logfile: /tmp/openais.log
> +	debug: off
> +	timestamp: on
> +	logger {
> +		ident: AMF
> +		debug: on
> +		tags: enter|leave|trace1|trace2|trace3|trace4|trace6
> +	}
> +
> +Setting the top-level 'debug: on' gives many printouts from all other parts of openais.
> +
> +Run the example on a cluster with 2 nodes
> +-----------------------------------------
> +
> +It is easy to run the example on more than one node.
> +Modify the file openais_amf_example.conf:
> +
> +<1>
> +Replace the address in the following line with your own network address:
> +		bindnetaddr: 127.0.0.0
> +
> +bindnetaddr specifies the address which the openais Executive should bind to.
> +This address should always end in zero.  If the local interface that traffic
> +should be routed over is 192.168.5.92, set bindnetaddr to 192.168.5.0.
> +
> +Modify amf_example.conf like this:
> +<1>
> +Remove the comment character '#' from the following lines:
> +#	safAmfNode = AMF2 {
> +#		saAmfNodeSuFailOverProb=2000
> +#		saAmfNodeSuFailoverMax=2
> +#		saAmfNodeClmNode=p02
> +#	}
> +and replace p02 with the name of your second machine.
> +<2>
> +Locate the following two lines:
> +				saAmfSUHostedByNode=AMF1
> +#				saAmfSUHostedByNode=AMF2
> +
> +Replace them with:
> +
> +#				saAmfSUHostedByNode=AMF1
> +				saAmfSUHostedByNode=AMF2
> +
> +Feedback
> +--------
> +Any feed-back is appreciated.
> +
> +Keep in mind that only parts of the functionality are implemented. Reports of
> +bugs or behaviour not compliant with the AMF specification within the
> +implemented part are greatly appreciated :-).
> +
> +What is currently NOT implemented ?
> +-----------------------------------
> +The following list specifies all chapters of the AMF specification which
> +currently are NOT fully implemented. The deviations from the specification are
> +described briefly except in those cases when none of the requirements in the
> +chapter is implemented. 
> +
> +Chapter:				Deviation:
> +---------				----------
> +3.3.1.2 Administrative State		Not supported (always UNLOCKED).
> +3.3.1.4 Readiness State			State STOPPING is not supported.
> +3.3.1.5 Service Unit’s HA State ... 	State QUIESCING is not supported.
> +3.3.2.2 Operational State		AMF does not detect errors in the
> +					following cases:
> +					• A command used by the Availability
> +					  Management Framework to control the
> +					  component life cycle returned an
> +					  error or did not return in time.
> +					• The component fails to respond in
> +					  time to an Availability Management
> +					  Framework's callback.
> +					• The component responds to an
> +					  Availability Management Framework's
> +					  state change callback
> +					  (SaAmfCSISetCallbackT) with an error.
> +					• If the component is SA-aware, and it
> +					  does not register with the
> +					  Availability Management Framework
> +					  within the preconfigured time-period
> +					  after its instantiation.
> +					• If the component is SA-aware, and it
> +					  unexpectedly unregisters with the
> +					  Availability Management Framework.
> +					• The component terminates unexpectedly.
> +					• When a fail-over recovery operation
> +					  performed at the level of the service
> +					  unit or the node containing the 
> +					  service unit triggers an abrupt
> +					  termination of the component.
> +3.3.2.3 Readiness State			State STOPPING is not supported.
> +3.3.2.4 Component’s HA State per ... 	State QUIESCING is not supported.
> +3.3.3.1 Administrative State		Not supported (always UNLOCKED).
> +3.3.5 Service Group States		Administrative state is not supported
> +					(always UNLOCKED).
> +3.3.6.1 Administrative State		Not supported (always UNLOCKED).
> +3.3.6.2 Operational State		None of the rules for transition between states are implemented.
> +3.3.7 Application States		Administrative state is not supported (always UNLOCKED).
> +3.3.8 Cluster States			Administrative state is not supported (always UNLOCKED).
> +3.5.1 Combined States for Pre-Inst....	Only Administrative state = UNLOCKED is supported.
> +3.5.2 Combined States for Non-Pre-I...	Not supported.
> +3.6 Component Capability Model		Configuration of capability model is
> +					ignored. AMF expects all components to
> +					be capable of being x_active_or_y_standby.
> +3.7.2 2N Redundancy Model		Not supported.
> +3.7.3.1 Basics				Spare service units can not be handled
> +					properly.
> +3.7.3.3 Configuration			• Ordered list of service units for a
> +					  service group: Not supported 
> +					  (the order is unpredictable).
> +					• Ordered list of SIs: Neither ranking
> +					  nor dependencies among SIs are 
> +					  supported. SIs are assigned to SUs in 
> +					  any order.
> +					• Auto-adjust option: Not supported.
> +					  Auto-adjust is never done.
> +3.7.3.5.1 Handling of a Node Failure.. 	Not supported.
> +3.7.3.6 An Example of Auto-adjust	Not supported.
> +3.7.4 N-Way Redundancy Model		Not supported.
> +3.7.5 N-Way Active Redundancy Model	Not supported.
> +3.7.6 No Redundancy Model		Not supported.
> +3.7.7 The Effect of Administrative...	Not supported.
> +3.9 Dependencies Among SIs, Compone.. 	Not supported.
> +3.11 Component Monitoring		• Passive Monitoring: Not supported.
> +					• External Active Monitoring:
> +					  Not supported.
> +3.12.1.1 Error Detection		AMF does not support that a component
> +					reports an error for another component.
> +3.12.1.2 Restart			• AMF does not support termination of
> +					  components by the terminate call-back
> + 					  or the TERMINATE command.
> +					• AMF does not consider component
> +					  instantiation-level at restart.
> +					• The configuration option
> +					  disableRestart is not supported.
> +3.12.1.3 Recovery			• Component or Service Unit Fail-Over:
> +					  • Component fail-over is not
> +					    implemented.
> +					  • Only SU fail-over is implemented and
> +					    the only way to trigger that case is
> +					    by error escalation.
> +					• Node Switch-Over: Not implemented
> +					• Node Fail-Over: Not implemented
> +					• Node Fail-Fast: Not implemented
> +					• The configuration option 
> +					  recoveryOnFailure is not handled,
> +					  i.e. is never evaluated. 
> +
> +3.12.1.4 Repair				• The configuration attribute for
> +					  automatic repair is not evaluated.
> +					• The administrative operation 
> +					  SA_AMF_ADMIN_REPAIRED is not 
> +					  implemented.
> +					• Repair after component fail-over
> +					  is not implemented.
> +					• Node leave while performing
> +					  automatic repair of that node,
> +					  is not implemented.
> +					• Service unit failover recovery:
> +					  Is implemented except that an attempt
> +					  to repair is always done (confi- 
> +					  guration attribute is not evaluated).
> +					• Repair after Node Switch-Over,
> +					  Fail-Over or Fail-Fast 
> +					  is not implemented.
> +3.12.1.5 Recovery Escalation		The recommended recovery action is not
> +					evaluated at the reception of an error
> +					report.
> +3.12.2.1 Recommended Recovery Action	The recommended recovery action is
> +					never evaluated. Recovery action
> +					SA_AMF_COMPONENT_RESTART is always
> +					assumed.
> +3.12.2.2 Escalations of Levels 1 and 2	Is implemented with the following exception:
> +					• The configuration attribute
> +					  component_restart_max is compared to
> +					  the restart counter of the component
> +					  that has reported the error instead of
> +					  against the sum of all restart
> +					  counters of all components within 
> +					  the SU.
> +3.12.2.3 Escalation of Level 3		Not implemented
> +4.2 CLC-CLI's Environment Variables	Translation of non-printable Unicode
> +					characters is not supported.
> +4.4 INSTANTIATE Command			• AMF does not evaluate the exit code of
> +					  the INSTANTIATE command as described
> +					  in the specification.
> +					• AMF does not supervise that an
> +					  SA-aware component registers itself,
> +					  within the time limit configured.
> +					As a consequence, none of the recovery
> +					actions described are implemented.
> +4.5 TERMINATE Command			Not supported.
> +4.6 CLEANUP Command			AMF does not evaluate the exit code of
> +					the CLEANUP command and thus does not
> +					implement any recovery action.
> +4.7 AM_START Command			Not supported.
> +4.8 AM_STOP Command			Not supported.
> +5 Proxied Component Management		Not implemented.
> +7 Administrative API			Not implemented
> +8 Basic Operational Scenarios		Not implemented.
> +9 Alarms and Notifications		Not implemented.
> +
> +Appendix A: Implementation of CLC ..	CLC-interfaces are partly implemented
> +					for SA-aware components. 
> +					The terminate operation,
> +					saAmfComponentTerminateCallback(),
> +					is never called.
> +					No CLC-interfaces are implemented for
> +					any other type of component.
> +
> +Appendix B: API functions in Unre....	AMF does not verify that the rules
> +					described are fulfilled.
> +
> +
> +
> +Which functions of the AMF API are currently NOT implemented ?
> +--------------------------------------------------------------
> +
> +Function					Deviation
> +--------					---------
> +saAmfComponentUnregister()			Is implemented in the library
> +						but not in aisexec.
> +
> +saAmfPmStart()					Is implemented in the library
> +						but not in aisexec.
> +
> +saAmfPmStop()					Is implemented in the library
> +						but not in aisexec.
> +
> +saAmfHealthcheckStart()				This function takes a parameter
> +						of type SaAmfRecommendedRecoveryT.
> +						The value of this parameter is
> +						supposed to specify what kind of
> +						recovery AMF should execute if
> +						the component fails a health
> +						check. AMF does not read the
> +						value of this parameter but
> + 						instead always tries to recover
> +						the component by a component
> +						restart.
> +
> +void (*SaAmfCSIRemoveCallbackT)()		AMF will never make a call-back
> +						to this function.
> +void 
> +(*SaAmfComponentTerminateCallbackT)()		AMF will never make a call-back
> +						to this function.
> +void 
> +(*SaAmfProxiedComponentInstantiateCallbackT)()	AMF will never make a call-back
> +						to this function.
> +void 
> +(*SaAmfProxiedComponentCleanupCallbackT)()	AMF will never make a call-back
> +						to this function.
> +saAmfProtectionGroupTrack()			Is implemented in the library
> +						but not in aisexec.
> +
> +saAmfProtectionGroupTrackStop()			Is implemented in the library
> +						but not in aisexec.
> +
> +void (*SaAmfProtectionGroupTrackCallbackT)()	AMF will never make a call-back
> +						to this function.
> +
> +saAmfProtectionGroupNotificationFree()		Not implemented.
> +
> +saAmfComponentErrorReport()			This function takes a parameter
> +						of type SaAmfRecommendedRecoveryT.
> +						The value of this parameter is
> +						supposed to specify what kind of
> +						recovery AMF should execute for
> +						the reported error. AMF does not
> +						read the value of this parameter
> +						but instead always tries to
> +						recover the component by a
> +						component restart.
> +
> +saAmfComponentErrorClear()			Is implemented in the library
> +						but not in aisexec.
> +
> +
> 
> 
> ------------------------------------------------------------------------
> 
> _______________________________________________
> Openais mailing list
> Openais at lists.osdl.org
> https://lists.osdl.org/mailman/listinfo/openais




More information about the Openais mailing list