Android 12 init 子进程回收与服务重启分析 admin 2023-02-13 18:33:02 篇首语:本文由小编为大家整理,主要介绍了Android 12 init 子进程回收与服务重启分析相关的知识,希望对你有一定的参考价值。 文章托管在gitee上 Android Notes , 同步csdn 本文基于android12 分析 在init运行过程中,不可避免地会出现子进程或服务退出,需要做一些针对性处理: 对于已终止的子进程需要将其回收掉,防止产生僵尸进程;对于非oneshot服务,需要重新将其拉起,防止异常退出。 处理子进程退出 在init中通过监听信号 SIGCHLD,来获取子进程终止事件,然后做一些针对性动作。 InstallSignalFdHandler 初始化信号处理器,注册子进程终止的监听 /// @system/core/init/init.cppstatic void InstallSignalFdHandler(Epoll* epoll) // Applying SA_NOCLDSTOP to a defaulted SIGCHLD handler prevents the signalfd from receiving // SIGCHLD when a child process stops or continues (b/77867680#comment9). const struct sigaction act .sa_handler = SIG_DFL, .sa_flags = SA_NOCLDSTOP ; sigaction(SIGCHLD, &act, nullptr);// 添加flag ,不接收进程 stop/continue 事件 sigset_t mask; sigemptyset(&mask); sigaddset(&mask, SIGCHLD); if (!IsRebootCapable()) // 没有CAP_SYS_BOOT capability,不具备重启能力 // If init does not have the CAP_SYS_BOOT capability, it is running in a container. // In that case, receiving SIGTERM will cause the system to shut down. sigaddset(&mask, SIGTERM); // 添加SIGTERM到信号集 if (sigprocmask(SIG_BLOCK, &mask, nullptr) == -1) // block这些信号,与signalfd匹配使用 PLOG(FATAL) << "failed to block signals"; // Register a handler to unblock signals in the child processes. 
// UnblockSignals在fork返回之前,在子进程上下文中被执行,使得子进程不block这些信号 const int result = pthread_atfork(nullptr, nullptr, &UnblockSignals); if (result != 0) LOG(FATAL) << "Failed to register a fork handler: " << strerror(result); signal_fd = signalfd(-1, &mask, SFD_CLOEXEC); // 创建fd,用于读取被block的信号 if (signal_fd == -1) PLOG(FATAL) << "failed to create signalfd"; // 通过 epoll 监听新的信号到来 if (auto result = epoll->RegisterHandler(signal_fd, HandleSignalFd); !result.ok()) LOG(FATAL) << result.error(); UnblockSignals 在子进程执行该函数,即子进程默认是不阻塞这些信号的。 /// @system/core/init/init.cppstatic void UnblockSignals() const struct sigaction act .sa_handler = SIG_DFL ; sigaction(SIGCHLD, &act, nullptr); sigset_t mask; sigemptyset(&mask); sigaddset(&mask, SIGCHLD); sigaddset(&mask, SIGTERM); if (sigprocmask(SIG_UNBLOCK, &mask, nullptr) == -1) PLOG(FATAL) << "failed to unblock signals for PID " << getpid(); 当epoll监听到signal_fd有事件到来,即产生了相关信号,则会回调HandleSignalFd来处理 HandleSignalFd /// @system/core/init/init.cppstatic void HandleSignalFd() signalfd_siginfo siginfo; // 从fd读取信号信息 ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo))); if (bytes_read != sizeof(siginfo)) PLOG(ERROR) << "Failed to read siginfo from signal_fd"; return; switch (siginfo.ssi_signo) case SIGCHLD: // 子进程终止事件 ReapAnyOutstandingChildren(); break; case SIGTERM: // 信号15,kill命令默认发送的信号 HandleSigtermSignal(siginfo); break; default: PLOG(ERROR) << "signal_fd: received unexpected signal " << siginfo.ssi_signo; break; 处理 SIGCHLD 会调用ReapAnyOutstandingChildren,它实现了所有终止子进程的回收 ReapAnyOutstandingChildren /// @system/core/init/sigchld_handler.cppvoid ReapAnyOutstandingChildren() while (ReapOneProcess() != 0) // 循环处理所有已终止的进程(调用exit或被信号杀死) ReapOneProcess 这个函数的作用如下: 调用waitid(带WNOWAIT,仅查询而不回收)获取一个已终止的进程,并在函数返回时通过waitpid将其真正回收;打印进程死亡原因(被信号kill或者调用exit退出);针对 service 调用其 Reap 函数,清理状态、处理重启及 onrestart 命令 /// @system/core/init/sigchld_handler.cppstatic pid_t ReapOneProcess() siginfo_t siginfo = ; // This returns a zombie pid or informs us that there are no zombies left to be reaped. 
// It does NOT reap the pid; that is done below. if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0) PLOG(ERROR) << "waitid failed"; return 0; auto pid = siginfo.si_pid; if (pid == 0) return 0; // At this point we know we have a zombie pid, so we use this scopeguard to reap the pid // whenever the function returns from this point forward. // We do NOT want to reap the zombie earlier as in Service::Reap(), we kill(-pid, ...) and we // want the pid to remain valid throughout that (and potentially future) usages. auto reaper = make_scope_guard([pid] TEMP_FAILURE_RETRY(waitpid(pid, nullptr, WNOHANG)); ); std::string name; std::string wait_string; Service* service = nullptr; if (SubcontextChildReap(pid)) // 处理Subcontext进程退出,非正在关机中会重启该进程 name = "Subcontext"; else // 判断该进程是否是某个服务,比如surfaceflinger service = ServiceList::GetInstance().FindService(pid, &Service::pid); if (service) // 服务存在 name = StringPrintf("Service "%s" (pid %d)", service->name().c_str(), pid); if (service->flags() & SVC_EXEC) // 通过"exec" or "exec_start" 启动的可执行程序进程 auto exec_duration = boot_clock::now() - service->time_started(); auto exec_duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration).count(); wait_string = StringPrintf(" waiting took %f seconds", exec_duration_ms / 1000.0f); else if (service->flags() & SVC_ONESHOT) // 一次性的服务 auto exec_duration = boot_clock::now() - service->time_started(); auto exec_duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration) .count(); wait_string = StringPrintf(" oneshot service took %f seconds in background", exec_duration_ms / 1000.0f); else name = StringPrintf("Untracked pid %d", pid); // 非服务进程,未追踪的进程退出 if (siginfo.si_code == CLD_EXITED) // 进程 exit LOG(INFO) << name << " exited with status " << siginfo.si_status << wait_string; else // 进程被 kill LOG(INFO) << name << " received signal " << siginfo.si_status << wait_string; if (!service) return pid; service->Reap(siginfo); // 
调用Reap,做清理工作,并重启非oneshot的服务 if (service->flags() & SVC_TEMPORARY) // 通过"exec" 启动的服务 ServiceList::GetInstance().RemoveService(*service); return pid; Service::Reap kill进程组所有进程;清理所有socket资源相关文件;回调reap_callbacks_,比如之前设置的启动失败回调;critical服务持续退出(4分钟内或开机完成前退出超过4次),则重启到BootLoader;标记服务SVC_RESTARTING,在HandleProcessActions中重启服务;执行onrestart命令;通知服务状态改变 /// @system/core/init/service.cpp void Service::Reap(const siginfo_t& siginfo) if (!(flags_ & SVC_ONESHOT) || (flags_ & SVC_RESTART)) // 不是一次性的或者需要重启的 KillProcessGroup(SIGKILL, false); // 服务死亡,杀死其进程组所有进程, 第二个参数表示是否report_oneshot else // Legacy behavior from ~2007 until Android R: this else branch did not exist and we did not // kill the process group in this case. if (SelinuxGetVendorAndroidVersion() >= __ANDROID_API_R__) // 杀死oneshot服务的进程组 // The new behavior in Android R is to kill these process groups in all cases. The // "true" parameter instructions KillProcessGroup() to report a warning message where it // detects a difference in behavior has occurred. KillProcessGroup(SIGKILL, true); // Remove any socket resources we may have created. for (const auto& socket : sockets_) // 清理该服务创建的socket 路径文件 auto path = ANDROID_SOCKET_DIR "/" + socket.name; unlink(path.c_str()); for (const auto& f : reap_callbacks_) // 执行通过 AddReapCallback 添加的reap操作的回调 f(siginfo); if ((siginfo.si_code != CLD_EXITED || siginfo.si_status != 0) && on_failure_reboot_target_) LOG(ERROR) << "Service with "reboot_on_failure" option failed, shutting down system."; trigger_shutdown(*on_failure_reboot_target_);// 带有reboot_on_failure选项的服务,非正常退出则会触发关机 if (flags_ & SVC_EXEC) UnSetExec(); // 重置 is_exec_service_running_ flag if (flags_ & SVC_TEMPORARY) return; // 临时oneshot服务,返回 pid_ = 0; flags_ &= (~SVC_RUNNING); start_order_ = 0; // Oneshot processes go into the disabled state on exit, // except when manually restarted. 
// 标记为 SVC_RESTART 的,是需要重启服务的。在StopOrReset函数先kill进程,然后标记为SVC_RESTART,到回收后则进行重启 if ((flags_ & SVC_ONESHOT) && !(flags_ & SVC_RESTART) && !(flags_ & SVC_RESET)) flags_ |= SVC_DISABLED; // oneshot服务置disabled状态 // Disabled and reset processes do not get restarted automatically. if (flags_ & (SVC_DISABLED | SVC_RESET)) // disabled 和 reset 状态服务不重启 NotifyStateChange("stopped"); return; #if INIT_FULL_SOURCES static bool is_apex_updatable = android::sysprop::ApexProperties::updatable().value_or(false); #else static bool is_apex_updatable = false; #endif const bool is_process_updatable = !use_bootstrap_ns_ && is_apex_updatable; // If we crash > 4 times in "fatal_crash_window_" minutes or before boot_completed, // reboot into bootloader or set crashing property boot_clock::time_point now = boot_clock::now(); // critical 或可更新(如apex) 并且 服务未标记要重启 if (((flags_ & SVC_CRITICAL) || is_process_updatable) && !(flags_ & SVC_RESTART)) bool boot_completed = GetBoolProperty("sys.boot_completed", false); if (now < time_crashed_ + fatal_crash_window_ || !boot_completed) // 在窗口时间内 或 开机流程未完成 if (++crash_count_ > 4) auto exit_reason = boot_completed ? "in " + std::to_string(fatal_crash_window_.count()) + " minutes" : "before boot completed"; if (flags_ & SVC_CRITICAL) // critical 服务在窗口时间(4分钟内)或开机完成前 crash超过4次,则会重启到 bootloader if (!GetBoolProperty("init.svc_debug.no_fatal." + name_, false)) // Aborts into "fatal_reboot_target_". 
SetFatalRebootTarget(fatal_reboot_target_); LOG(FATAL) << "critical process "" << name_ << "" exited 4 times " << exit_reason; else // 非 critical 服务只有一个打印,然后记录到属性 LOG(ERROR) << "process with updatable components "" << name_ << "" exited 4 times " << exit_reason; // Notifies update_verifier and apexd SetProperty("sys.init.updatable_crashing_process_name", name_); SetProperty("sys.init.updatable_crashing", "1"); else // 重新记录时间和次数 time_crashed_ = now; crash_count_ = 1; flags_ &= (~SVC_RESTART); flags_ |= SVC_RESTARTING; // 注意此处标记,是服务重启的关键 // Execute all onrestart commands for this service. onrestart_.ExecuteAllCommands(); // 执行所有 onrestart 命令, 在rc里面配置的 NotifyStateChange("restarting"); return; Service::KillProcessGroup void Service::KillProcessGroup(int signal, bool report_oneshot) // If we've already seen a successful result from killProcessGroup*(), then we have removed // the cgroup already and calling these functions a second time will simply result in an error. // This is true regardless of which signal was sent. // These functions handle their own logging, so no additional logging is needed. if (!process_cgroup_empty_) LOG(INFO) << "Sending signal " << signal << " to service "" << name_ << "" (pid " << pid_ << ") process group..."; int max_processes = 0; int r; if (signal == SIGTERM) r = killProcessGroupOnce(proc_attr_.uid, pid_, signal, &max_processes); else r = killProcessGroup(proc_attr_.uid, pid_, signal, &max_processes); if (report_oneshot && max_processes > 0) LOG(WARNING) << "Killed " << max_processes << " additional processes from a oneshot process group for service "" << name_ << "". 
This is new behavior, previously child processes would not be killed in " "this case."; if (r == 0) process_cgroup_empty_ = true; if (oom_score_adjust_ != DEFAULT_OOM_SCORE_ADJUST) LmkdUnregister(name_, pid_); // 从lmkd移除进程信息 上面两个killProcessGroup实现如下: /// @system/core/libprocessgroup/processgroup.cppint killProcessGroup(uid_t uid, int initialPid, int signal, int* max_processes) // 内部调用DoKillProcessGroupOnce去kill进程组 return KillProcessGroup(uid, initialPid, signal, 40 /*retries*/, max_processes);int killProcessGroupOnce(uid_t uid, int initialPid, int signal, int* max_processes) return KillProcessGroup(uid, initialPid, signal, 0 /*retries*/, max_processes); 关于cgroup配置可参见 cgroups.json /// @system/core/libprocessgroup/profiles/cgroups.json "Cgroups": [ "Controller": "blkio", "Path": "/dev/blkio", "Mode": "0755", "UID": "system", "GID": "system" , "Controller": "cpu", "Path": "/dev/cpuctl", "Mode": "0755", "UID": "system", "GID": "system" , "Controller": "cpuset", "Path": "/dev/cpuset", "Mode": "0755", "UID": "system", "GID": "system" , "Controller": "memory", "Path": "/dev/memcg", "Mode": "0700", 以上是关于Android 12 init 子进程回收与服务重启分析的主要内容,如果未能解决你的问题,请参考以下文章 win32day10-组合框/列表框/滚动条/控件的自绘制 kali2021设置中文 您可能还会对下面的文章感兴趣: 相关文章 浏览器打不开网址提示“ERR_CONNECTION_TIMED_OUT”错误代码的解决方法 如何安装ocx控件 VMware的虚拟机为啥ip地址老是自动变化 vbyone和EDP区别 linux/debian到底怎么重启和关机 苹果平板键盘被弄到上方去了,如何调回正常? 机器学习常用距离度量 如何查看kindle型号