场景:我们需要访问若干台机器,进行状态检查、文件拷贝之类的操作。
分析:我们可以使用循环的方式对各台机器逐个进行操作。如果每个操作耗时很短,这种做法还可以接受;假如每个操作耗时很长,这种逐个处理的方式就显得效率低下。因为各个节点上的操作相互独立,我们可以使用多进程的方式并行处理。为了防止本地机器的资源耗费过大,我们又希望对并行的进程个数进行限制。在操作过程中,还可能出现操作卡住的情况,因此我们还需要限制每个操作的时间:如果超过这个限定的时间,我们就认为该操作失败,可能需要人工干预。
实现:基于上述分析,我们可以用 Perl 来实现这些功能。
use warnings;
use strict;
use feature 'say';    # say() is used throughout and is not enabled by default
use File::Spec;
use FindBin qw($Bin);
use Net::OpenSSH;
use POSIX ":sys_wait_h";

# Run operation_host() on every host in @all_host_ip, one forked child per
# host, with at most $host_number_simultaneity children alive at a time.
# A host whose child runs longer than $install_seconds_limiation seconds is
# reported as timed out.  Timed-out children are NOT killed; they are only
# reported so that an operator can step in.
#
# operation_host() is expected to be defined elsewhere in this file.

# Maximum number of hosts operated on at the same time.
my $host_number_simultaneity = 6;

# Maximum time (in seconds) a single host operation may take.
my $install_seconds_limiation = 1800;

# Seconds between two status checks while waiting for the remaining hosts.
my $poll_interval_seconds = 30;

# Seconds between two checks for a free slot while the limit is reached.
my $throttle_poll_seconds = 1;

# Login user name and password of the hosts.
# NOTE(review): credentials are hard-coded and host-key checking is disabled;
# consider reading them from a protected configuration source.
my $login_user   = "root";
my $login_passwd = "312";
my $ssh_ops      = {
    user        => $login_user,
    password    => $login_passwd,
    master_opts => [
        -o => "UserKnownHostsFile=/dev/null",
        -o => "StrictHostKeyChecking=no",
    ],
};

# Log file, created in the current directory.
my $log_file = 'install.log';
open( LOG_FILENAME, '>', $log_file )
    or die "\nCan't open log file $log_file to record results of opeation !! ($!)";

# Unbuffer both the log and STDOUT so that output pending at fork() time is
# not duplicated by the children, then restore the default output handle so
# that plain print() reaches the terminal rather than the log file.
my $previous_handle = select LOG_FILENAME;
$| = 1;
select STDOUT;
$| = 1;
select $previous_handle;

my $time_stamp = _current_time_stamp();
_log_and_print("step 0: operation begin at $time_stamp");

# IP addresses of the hosts to operate on; adjust as needed.
my @all_host_ip = qw/192.168.10.10 192.168.10.12 192.168.10.14/;
say LOG_FILENAME "all the hosts ip are as follows: @all_host_ip";

my $host_number_total    = @all_host_ip;
my $host_number_ongoing  = 0;
my $host_number_finished = 0;
my $host_number_timeout  = 0;

# (key : pid, value : host ip)
my %pid_to_host = ();

# (key : pid, value : start time-seconds)
my %pid_to_start_time = ();

# (key : pid, value : true once the timeout of that pid has been logged)
my %timeout_reported = ();

# Return the current local time formatted like date(1), without forking.
sub _current_time_stamp {
    return POSIX::strftime( "%a %b %e %H:%M:%S %Z %Y", localtime );
}

# Write one message line to both the log file and STDOUT.
sub _log_and_print {
    my ($message) = @_;
    say LOG_FILENAME $message;
    print "$message\n";
    return;
}

# Reap every host worker that has already exited (non-blocking) and update
# the bookkeeping.  Children are counted here instead of in a SIGCHLD handler
# because signals may coalesce and would then be under-counted.
sub _reap_finished_hosts {
    while ( ( my $finished_pid = waitpid( -1, WNOHANG ) ) > 0 ) {
        my $wait_status = $?;

        # Ignore children that are not host workers.
        next if !exists $pid_to_host{$finished_pid};

        my $host = delete $pid_to_host{$finished_pid};
        delete $pid_to_start_time{$finished_pid};
        $host_number_ongoing--;
        $host_number_finished++;

        my $finish_time = _current_time_stamp();
        say LOG_FILENAME "operation on host $host finished at $finish_time ";
        if ( $wait_status != 0 ) {
            say LOG_FILENAME
                "operation on host $host ended with non-zero wait status $wait_status";
        }
    }
    return;
}

# Return the number of still-running hosts that exceeded the time limit.
# Each timed-out host is written to the log once.
sub _count_timed_out_hosts {
    my $current_time = time;
    my $timed_out    = 0;
    for my $pid ( sort { $a <=> $b } keys %pid_to_start_time ) {
        my $elapsed = $current_time - $pid_to_start_time{$pid};
        next if $elapsed < $install_seconds_limiation;
        $timed_out++;
        next if $timeout_reported{$pid}++;
        say LOG_FILENAME "operation on host $pid_to_host{$pid} are time out yet";
    }
    return $timed_out;
}

say LOG_FILENAME "there are $host_number_total hosts in all";

# Phase 1: start one worker per host, never exceeding the concurrency limit.
for my $index ( 0 .. $#all_host_ip ) {
    my $host_ip = $all_host_ip[$index];

    my $pid = fork();
    if ( !defined $pid ) {
        my $fork_error = $!;    # capture before any other call can reset it
        _log_and_print(
            "Error in fork: $fork_error. This is going to deal with host $host_ip");
        exit 1;
    }

    if ( $pid == 0 ) {

        # Child: operate on exactly one host, then leave.
        _log_and_print(
            "index: $index -- This is going to begin operation on host $host_ip");
        operation_host($host_ip);
        say LOG_FILENAME
            "index: $index -- This is the end of installation on host $host_ip";
        exit 0;
    }

    # Parent: remember the worker.
    $pid_to_host{$pid}       = $host_ip;
    $pid_to_start_time{$pid} = time;
    $host_number_ongoing++;

    # When the limit is reached, wait until at least one worker has finished.
    _reap_finished_hosts();
    while ( $host_number_ongoing >= $host_number_simultaneity ) {
        sleep($throttle_poll_seconds);
        _reap_finished_hosts();
    }
}

# Phase 2: wait until every host has either finished or timed out.
while (1) {
    _reap_finished_hosts();
    $host_number_timeout = _count_timed_out_hosts();
    last
        if ( $host_number_finished + $host_number_timeout ) >= $host_number_total;

    $time_stamp = _current_time_stamp();
    _log_and_print(
        "sleep ${poll_interval_seconds}s to check again...current time: $time_stamp");
    sleep($poll_interval_seconds);
}

# Report the hosts that were still running when the time limit was reached.
if ( $host_number_total > 0 ) {
    for my $pid ( sort { $a <=> $b } keys %pid_to_host ) {
        _log_and_print(
            "operation on host $pid_to_host{$pid} are time out finally!!");
    }
}

close LOG_FILENAME;
exit 0;