A Regional Engineer's Learning Diary

A blog of rough notes on technologies I find interesting, plus the occasional diary entry and gadget write-up.

[Linux] Walking through how sockets are closed when a process exits

Process termination is handled by the do_exit() function. It runs not only when a process explicitly calls the exit system call, but also when the process dies from a signal, and so on. do_exit() also releases the task's resources, and from there it eventually reaches the socket-closing code I want to read this time. These are rough notes so I can come back to them later.
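Before diving into the kernel, a quick userspace check (this example is mine, not from the kernel tree; 127.0.0.1:8080 is an assumed local test server): a process that connects a TCP socket and exits without ever calling close(2) still produces a FIN (or RST) on the wire, because the exit path traced below closes the descriptor for it.

/*
 * Illustrative userspace demo (not kernel code): connect a TCP socket and
 * exit without close(2). The kernel path below
 * (do_exit() -> exit_files() -> ... -> tcp_close()) closes it anyway,
 * which you can confirm with `tcpdump -i lo port 8080`.
 * 127.0.0.1:8080 is an assumed local test server.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in addr;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8080);                 /* assumed test port */
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("connect");
        return 1;
    }

    /* No close(fd): the FIN still shows up, sent from the exit path. */
    _exit(0);
}

The kernel side of that starts in do_exit() (kernel/exit.c):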

void __noreturn do_exit(long code)
{
    struct task_struct *tsk = current;
    int group_dead;

    synchronize_group_exit(tsk, code);

    WARN_ON(tsk->plug);

    kcov_task_exit(tsk);
    kmsan_task_exit(tsk);

    coredump_task_exit(tsk);
    ptrace_event(PTRACE_EVENT_EXIT, code);

    validate_creds_for_do_exit(tsk);

    io_uring_files_cancel();
    exit_signals(tsk);  /* sets PF_EXITING */

    /* sync mm's RSS info before statistics gathering */
    if (tsk->mm)
        sync_mm_rss(tsk->mm);
    acct_update_integrals(tsk);
    group_dead = atomic_dec_and_test(&tsk->signal->live);
    if (group_dead) {
        /*
        * If the last thread of global init has exited, panic
        * immediately to get a useable coredump.
        */
        if (unlikely(is_global_init(tsk)))
            panic("Attempted to kill init! exitcode=0x%08x\n",
                tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
        hrtimer_cancel(&tsk->signal->real_timer);
        exit_itimers(tsk);
#endif
        if (tsk->mm)
            setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
    }
    acct_collect(code, group_dead);
    if (group_dead)
        tty_audit_exit();
    audit_free(tsk);

    tsk->exit_code = code;
    taskstats_exit(tsk, group_dead);

    exit_mm();

    if (group_dead)
        acct_process();
    trace_sched_process_exit(tsk);

    exit_sem(tsk);
    exit_shm(tsk);
    exit_files(tsk);  // here: this is the call we want to follow
    exit_fs(tsk);
    if (group_dead)
        disassociate_ctty(1);
    exit_task_namespaces(tsk);
    exit_task_work(tsk);
    exit_thread(tsk);

    /*
    * Flush inherited counters to the parent - before the parent
    * gets woken up by child-exit notifications.
    *
    * because of cgroup mode, must be called before cgroup_exit()
    */
    perf_event_exit_task(tsk);

    sched_autogroup_exit_task(tsk);
    cgroup_exit(tsk);

    /*
    * FIXME: do that only when needed, using sched_exit tracepoint
    */
    flush_ptrace_hw_breakpoint(tsk);

    exit_tasks_rcu_start();
    exit_notify(tsk, group_dead);
    proc_exit_connector(tsk);
    mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
    if (unlikely(current->pi_state_cache))
        kfree(current->pi_state_cache);
#endif
    /*
    * Make sure we are holding no locks:
    */
    debug_check_no_locks_held();

    if (tsk->io_context)
        exit_io_context(tsk);

    if (tsk->splice_pipe)
        free_pipe_info(tsk->splice_pipe);

    if (tsk->task_frag.page)
        put_page(tsk->task_frag.page);

    validate_creds_for_do_exit(tsk);
    exit_task_stack_account(tsk);

    check_stack_usage();
    preempt_disable();
    if (tsk->nr_dirtied)
        __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
    exit_rcu();
    exit_tasks_rcu_finish();

    lockdep_free_task(tsk);
    do_task_dead();
}
From the list above, the call we care about is exit_files(), which detaches the task's files_struct and drops the reference to it:

void exit_files(struct task_struct *tsk)
{
    struct files_struct * files = tsk->files;

    if (files) {
        task_lock(tsk);
        tsk->files = NULL;
        task_unlock(tsk);
        put_files_struct(files);
    }
}
put_files_struct() decrements the refcount; when the last user is gone, it closes every descriptor still in the table via close_files():

void put_files_struct(struct files_struct *files)
{
    if (atomic_dec_and_test(&files->count)) {
        struct fdtable *fdt = close_files(files);

        /* free the arrays if they are not embedded */
        if (fdt != &files->fdtab)
            __free_fdtable(fdt);
        kmem_cache_free(files_cachep, files);
    }
}
close_files() walks the open_fds bitmap one long at a time and calls filp_close() on every struct file that is still installed:

static struct fdtable *close_files(struct files_struct * files)
{
    /*
    * It is safe to dereference the fd table without RCU or
    * ->file_lock because this is the last reference to the
    * files structure.
    */
    struct fdtable *fdt = rcu_dereference_raw(files->fdt);
    unsigned int i, j = 0;

    for (;;) {
        unsigned long set;
        i = j * BITS_PER_LONG;
        if (i >= fdt->max_fds)
            break;
        set = fdt->open_fds[j++];
        while (set) {
            if (set & 1) {
                struct file * file = xchg(&fdt->fd[i], NULL);
                if (file) {
                    filp_close(file, files);
                    cond_resched();
                }
            }
            i++;
            set >>= 1;
        }
    }

    return fdt;
}
int filp_close(struct file *filp, fl_owner_t id)
{
    int retval = 0;

    if (!file_count(filp)) {
        printk(KERN_ERR "VFS: Close: file count is 0\n");
        return 0;
    }

    if (filp->f_op->flush)
        retval = filp->f_op->flush(filp, id);

    if (likely(!(filp->f_mode & FMODE_PATH))) {
        dnotify_flush(filp, id);
        locks_remove_posix(filp, id);
    }
    fput(filp);
    return retval;
}
filp_close() ends in fput(). fput() drops f_count; when it reaches zero, the real teardown (__fput()) is deferred, normally as task_work that runs when the task returns to user space:

void fput(struct file *file)
{
    if (atomic_long_dec_and_test(&file->f_count)) {
        struct task_struct *task = current;

        if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
            init_task_work(&file->f_rcuhead, ____fput);
            if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
                return;
            /*
            * After this task has run exit_task_work(),
            * task_work_add() will fail.  Fall through to delayed
            * fput to avoid leaking *file.
            */
        }

        if (llist_add(&file->f_llist, &delayed_fput_list))
            schedule_delayed_work(&delayed_fput_work, 1);
    }
}
The task_work callback registered above is ____fput(), which just recovers the struct file from the callback_head and calls __fput():

static void ____fput(struct callback_head *work)
{
    __fput(container_of(work, struct file, f_rcuhead));
}
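If the task cannot take task_work (kernel threads, or fput() from interrupt context), the file goes onto delayed_fput_list instead and a workqueue drains it. A sketch of that worker, consistent with the f_llist field used in fput() above (see fs/file_table.c for the real thing):

/*
 * Sketch of the delayed path (fs/file_table.c): the work item simply pops
 * everything off delayed_fput_list and runs __fput() on each file.
 */
static void delayed_fput(struct work_struct *unused)
{
    struct llist_node *node = llist_del_all(&delayed_fput_list);
    struct file *f, *t;

    llist_for_each_entry_safe(f, t, node, f_llist)
        __fput(f);
}

Either way, the actual release work ends up in __fput():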
static void __fput(struct file *file)
{
    struct dentry *dentry = file->f_path.dentry;
    struct vfsmount *mnt = file->f_path.mnt;
    struct inode *inode = file->f_inode;
    fmode_t mode = file->f_mode;

    if (unlikely(!(file->f_mode & FMODE_OPENED)))
        goto out;

    might_sleep();

    fsnotify_close(file);
    /*
    * The function eventpoll_release() should be the first called
    * in the file cleanup chain.
    */
    eventpoll_release(file);
    locks_remove_file(file);

    ima_file_free(file);
    if (unlikely(file->f_flags & FASYNC)) {
        if (file->f_op->fasync)
            file->f_op->fasync(-1, file, 0);
    }
    if (file->f_op->release)
        file->f_op->release(inode, file);
    if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
             !(mode & FMODE_PATH))) {
        cdev_put(inode->i_cdev);
    }
    fops_put(file->f_op);
    put_pid(file->f_owner.pid);
    put_file_access(file);
    dput(dentry);
    if (unlikely(mode & FMODE_NEED_UNMOUNT))
        dissolve_on_fput(mnt);
    mntput(mnt);
out:
    file_free(file);
}
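__fput() is still generic VFS code; the socket-specific part hides behind file->f_op->release. A socket's struct file uses socket_file_ops (net/socket.c), whose .release is sock_close(); that calls __sock_release(), which invokes sock->ops->release, i.e. inet_release() (net/ipv4/af_inet.c) for an AF_INET stream socket; and inet_release() finally calls sk->sk_prot->close(), which is tcp_close() for TCP. Below is a trimmed sketch of inet_release() (treat it as a paraphrase and check the real source; details vary by version). One detail matters for this article: when the close comes from process exit, SO_LINGER is ignored.

/*
 * Trimmed sketch of inet_release() (net/ipv4/af_inet.c); BPF hooks and
 * other version-specific details omitted -- see the actual source.
 */
int inet_release(struct socket *sock)
{
    struct sock *sk = sock->sk;

    if (sk) {
        long timeout;

        /* Applications often forget to leave multicast groups. */
        ip_mc_drop_socket(sk);

        /*
         * Honor SO_LINGER only for an explicit close(); if the close
         * happens because the process is exiting (PF_EXITING), never
         * linger -- timeout stays 0.
         */
        timeout = 0;
        if (sock_flag(sk, SOCK_LINGER) &&
            !(current->flags & PF_EXITING))
            timeout = sk->sk_lingertime;

        sock->sk = NULL;
        sk->sk_prot->close(sk, timeout);    /* tcp_close() for TCP */
    }
    return 0;
}

tcp_close() itself (net/ipv4/tcp.c) is just a locking wrapper around __tcp_close():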
void tcp_close(struct sock *sk, long timeout)
{
    lock_sock(sk);
    __tcp_close(sk, timeout);
    release_sock(sk);
    sock_put(sk);
}
__tcp_close() is where the protocol work happens: flush unread receive data, pick FIN versus RST, and then either destroy the socket right away or leave it to the FIN_WAIT2/TIME_WAIT handling:

void __tcp_close(struct sock *sk, long timeout)
{
    struct sk_buff *skb;
    int data_was_unread = 0;
    int state;

    sk->sk_shutdown = SHUTDOWN_MASK;

    if (sk->sk_state == TCP_LISTEN) {
        tcp_set_state(sk, TCP_CLOSE);

        /* Special case. */
        inet_csk_listen_stop(sk);

        goto adjudge_to_death;
    }

    /*  We need to flush the recv. buffs.  We do this only on the
    *  descriptor close, not protocol-sourced closes, because the
    *  reader process may not have drained the data yet!
    */
    while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
        u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;

        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
            len--;
        data_was_unread += len;
        __kfree_skb(skb);
    }

    /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
    if (sk->sk_state == TCP_CLOSE)
        goto adjudge_to_death;

    /* As outlined in RFC 2525, section 2.17, we send a RST here because
    * data was lost. To witness the awful effects of the old behavior of
    * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
    * GET in an FTP client, suspend the process, wait for the client to
    * advertise a zero window, then kill -9 the FTP client, wheee...
    * Note: timeout is always zero in such a case.
    */
    if (unlikely(tcp_sk(sk)->repair)) {
        sk->sk_prot->disconnect(sk, 0);
    } else if (data_was_unread) {
        /* Unread data was tossed, zap the connection. */
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
        tcp_set_state(sk, TCP_CLOSE);
        tcp_send_active_reset(sk, sk->sk_allocation);
    } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
        /* Check zero linger _after_ checking for unread data. */
        sk->sk_prot->disconnect(sk, 0);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
    } else if (tcp_close_state(sk)) {
        /* We FIN if the application ate all the data before
        * zapping the connection.
        */

        /* RED-PEN. Formally speaking, we have broken TCP state
        * machine. State transitions:
        *
        * TCP_ESTABLISHED -> TCP_FIN_WAIT1
        * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
        * TCP_CLOSE_WAIT -> TCP_LAST_ACK
        *
        * are legal only when FIN has been sent (i.e. in window),
        * rather than queued out of window. Purists blame.
        *
        * F.e. "RFC state" is ESTABLISHED,
        * if Linux state is FIN-WAIT-1, but FIN is still not sent.
        *
        * The visible declinations are that sometimes
        * we enter time-wait state, when it is not required really
        * (harmless), do not send active resets, when they are
        * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
        * they look as CLOSING or LAST_ACK for Linux)
        * Probably, I missed some more holelets.
        *                      --ANK
        * XXX (TFO) - To start off we don't support SYN+ACK+FIN
        * in a single packet! (May consider it later but will
        * probably need API support or TCP_CORK SYN-ACK until
        * data is written and socket is closed.)
        */
        tcp_send_fin(sk);
    }

    sk_stream_wait_close(sk, timeout);

adjudge_to_death:
    state = sk->sk_state;
    sock_hold(sk);
    sock_orphan(sk);

    local_bh_disable();
    bh_lock_sock(sk);
    /* remove backlog if any, without releasing ownership. */
    __release_sock(sk);

    this_cpu_inc(tcp_orphan_count);

    /* Have we already been destroyed by a softirq or backlog? */
    if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
        goto out;

    /* This is a (useful) BSD violating of the RFC. There is a
    *  problem with TCP as specified in that the other end could
    *  keep a socket open forever with no application left this end.
    *  We use a 1 minute timeout (about the same as BSD) then kill
    *  our end. If they send after that then tough - BUT: long enough
    *  that we won't make the old 4*rto = almost no time - whoops
    *  reset mistake.
    *
    *  Nope, it was not mistake. It is really desired behaviour
    *  f.e. on http servers, when such sockets are useless, but
    *  consume significant resources. Let's do it with special
    *  linger2 option.                 --ANK
    */

    if (sk->sk_state == TCP_FIN_WAIT2) {
        struct tcp_sock *tp = tcp_sk(sk);
        if (tp->linger2 < 0) {
            tcp_set_state(sk, TCP_CLOSE);
            tcp_send_active_reset(sk, GFP_ATOMIC);
            __NET_INC_STATS(sock_net(sk),
                    LINUX_MIB_TCPABORTONLINGER);
        } else {
            const int tmo = tcp_fin_time(sk);

            if (tmo > TCP_TIMEWAIT_LEN) {
                inet_csk_reset_keepalive_timer(sk,
                        tmo - TCP_TIMEWAIT_LEN);
            } else {
                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                goto out;
            }
        }
    }
    if (sk->sk_state != TCP_CLOSE) {
        if (tcp_check_oom(sk, 0)) {
            tcp_set_state(sk, TCP_CLOSE);
            tcp_send_active_reset(sk, GFP_ATOMIC);
            __NET_INC_STATS(sock_net(sk),
                    LINUX_MIB_TCPABORTONMEMORY);
        } else if (!check_net(sock_net(sk))) {
            /* Not possible to send reset; just close */
            tcp_set_state(sk, TCP_CLOSE);
        }
    }

    if (sk->sk_state == TCP_CLOSE) {
        struct request_sock *req;

        req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
                        lockdep_sock_is_held(sk));
        /* We could get here with a non-NULL req if the socket is
        * aborted (e.g., closed with unread data) before 3WHS
        * finishes.
        */
        if (req)
            reqsk_fastopen_remove(sk, req, false);
        inet_csk_destroy_sock(sk);
    }
    /* Otherwise, socket is reprieved until protocol close. */

out:
    bh_unlock_sock(sk);
    local_bh_enable();
}
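One branch worth calling out: sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime is the abortive close, where disconnect() causes an RST instead of the normal FIN handshake. From userspace you reach it by setting SO_LINGER with l_linger = 0 before close(2) (illustrative example of mine; 127.0.0.1:8080 is again an assumed test server):

/*
 * Illustrative (not kernel code): SO_LINGER with l_linger = 0 makes
 * close(2) take the disconnect()/RST branch in __tcp_close() instead of
 * sending a FIN. 127.0.0.1:8080 is an assumed local test server.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in addr;
    struct linger lg = { .l_onoff = 1, .l_linger = 0 };
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8080);                 /* assumed test port */
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

    if (fd < 0 || connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("socket/connect");
        return 1;
    }

    /* SOCK_LINGER gets set and sk_lingertime is 0 -> abortive close. */
    setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));

    close(fd);   /* tcpdump shows an RST here instead of FIN/ACK */
    return 0;
}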
When __tcp_close() (via tcp_close_state()) decides a FIN is needed, tcp_send_fin() either piggybacks the FIN flag on the last not-yet-sent skb in the write queue or allocates a fresh skb for it:

void tcp_send_fin(struct sock *sk)
{
    struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
    struct tcp_sock *tp = tcp_sk(sk);

    /* Optimization, tack on the FIN if we have one skb in write queue and
    * this skb was not yet sent, or we are under memory pressure.
    * Note: in the latter case, FIN packet will be sent after a timeout,
    * as TCP stack thinks it has already been transmitted.
    */
    tskb = tail;
    if (!tskb && tcp_under_memory_pressure(sk))
        tskb = skb_rb_last(&sk->tcp_rtx_queue);

    if (tskb) {
        TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
        TCP_SKB_CB(tskb)->end_seq++;
        tp->write_seq++;
        if (!tail) {
            /* This means tskb was already sent.
            * Pretend we included the FIN on previous transmit.
            * We need to set tp->snd_nxt to the value it would have
            * if FIN had been sent. This is because retransmit path
            * does not change tp->snd_nxt.
            */
            WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
            return;
        }
    } else {
        skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
        if (unlikely(!skb))
            return;

        INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
        skb_reserve(skb, MAX_TCP_HEADER);
        sk_forced_mem_schedule(sk, skb->truesize);
        /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
        tcp_init_nondata_skb(skb, tp->write_seq,
                     TCPHDR_ACK | TCPHDR_FIN);
        tcp_queue_skb(sk, skb);
    }
    __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
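For reference, the tcp_close_state() call seen back in __tcp_close() is what decides whether a FIN is needed at all: it looks up the successor of the current state (ESTABLISHED -> FIN_WAIT1, CLOSE_WAIT -> LAST_ACK, and so on), switches to it, and returns the TCP_ACTION_FIN bit. A trimmed sketch (net/ipv4/tcp.c; the full new_state[] table is in the source):

/*
 * Trimmed sketch of tcp_close_state() (net/ipv4/tcp.c). new_state[] maps
 * each TCP state to its successor on close() and ORs in TCP_ACTION_FIN
 * when a FIN must go out; see the source for the full table.
 */
static int tcp_close_state(struct sock *sk)
{
    int next = (int)new_state[sk->sk_state];
    int ns = next & TCP_STATE_MASK;

    tcp_set_state(sk, ns);

    return next & TCP_ACTION_FIN;
}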
