Diffstat (limited to 'kernel'): 46 files changed, 1011 insertions, 585 deletions
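The largest part of this diff, in kernel/audit.c, replaces the old audit_pid/audit_sock/audit_nlk_portid globals with a single auditd_connection struct that is written under a spinlock and read under RCU. A condensed sketch of that access pattern follows; the field names are taken from the patch, but the helpers are simplified and omit the network namespace reference handling the real code does:

	#include <linux/types.h>
	#include <linux/spinlock.h>
	#include <linux/rcupdate.h>

	/* Connection state: one writer at a time (spinlock), lock-free readers (RCU). */
	static struct auditd_connection {
		int pid;		/* auditd PID, 0 when no daemon is registered */
		u32 portid;		/* netlink portid of the daemon */
		spinlock_t lock;	/* serializes writers only */
	} auditd_conn = {
		.lock = __SPIN_LOCK_UNLOCKED(auditd_conn.lock),
	};

	/* Writer side: update all fields under the spinlock so readers never
	 * observe a half-written connection. */
	static void auditd_set(int pid, u32 portid)
	{
		unsigned long flags;

		spin_lock_irqsave(&auditd_conn.lock, flags);
		auditd_conn.pid = pid;
		auditd_conn.portid = portid;
		spin_unlock_irqrestore(&auditd_conn.lock, flags);
	}

	/* Reader side: snapshot the fields inside one RCU read-side section and
	 * work on the local copies afterwards. */
	static bool auditd_connected(u32 *portid)
	{
		bool up;

		rcu_read_lock();
		up = auditd_conn.pid != 0;
		if (up)
			*portid = auditd_conn.portid;
		rcu_read_unlock();
		return up;
	}

In the patch itself, auditd_send_unicast_skb() and kauditd_thread() take exactly this kind of snapshot (pid, portid, plus a get_net() reference and the per-namespace sock) inside a single rcu_read_lock() section, then call netlink_unicast() only after dropping the RCU read lock.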
diff --git a/kernel/audit.c b/kernel/audit.c index e794544f5e63..a871bf80fde1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,10 @@  #include <linux/kthread.h>  #include <linux/kernel.h>  #include <linux/syscalls.h> +#include <linux/spinlock.h> +#include <linux/rcupdate.h> +#include <linux/mutex.h> +#include <linux/gfp.h>  #include <linux/audit.h> @@ -90,13 +94,34 @@ static u32	audit_default;  /* If auditing cannot proceed, audit_failure selects what happens. */  static u32	audit_failure = AUDIT_FAIL_PRINTK; -/* - * If audit records are to be written to the netlink socket, audit_pid - * contains the pid of the auditd process and audit_nlk_portid contains - * the portid to use to send netlink messages to that process. +/* private audit network namespace index */ +static unsigned int audit_net_id; + +/** + * struct audit_net - audit private network namespace data + * @sk: communication socket   */ -int		audit_pid; -static __u32	audit_nlk_portid; +struct audit_net { +	struct sock *sk; +}; + +/** + * struct auditd_connection - kernel/auditd connection state + * @pid: auditd PID + * @portid: netlink portid + * @net: the associated network namespace + * @lock: spinlock to protect write access + * + * Description: + * This struct is RCU protected; you must either hold the RCU lock for reading + * or the included spinlock for writing. + */ +static struct auditd_connection { +	int pid; +	u32 portid; +	struct net *net; +	spinlock_t lock; +} auditd_conn;  /* If audit_rate_limit is non-zero, limit the rate of sending audit records   * to that number per second.  This prevents DoS attacks, but results in @@ -123,10 +148,6 @@ u32		audit_sig_sid = 0;  */  static atomic_t	audit_lost = ATOMIC_INIT(0); -/* The netlink socket. */ -static struct sock *audit_sock; -static unsigned int audit_net_id; -  /* Hash for inode-based rules */  struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -192,6 +213,43 @@ struct audit_reply {  	struct sk_buff *skb;  }; +/** + * auditd_test_task - Check to see if a given task is an audit daemon + * @task: the task to check + * + * Description: + * Return 1 if the task is a registered audit daemon, 0 otherwise. + */ +int auditd_test_task(const struct task_struct *task) +{ +	int rc; + +	rcu_read_lock(); +	rc = (auditd_conn.pid && task->tgid == auditd_conn.pid ? 1 : 0); +	rcu_read_unlock(); + +	return rc; +} + +/** + * audit_get_sk - Return the audit socket for the given network namespace + * @net: the destination network namespace + * + * Description: + * Returns the sock pointer if valid, NULL otherwise.  The caller must ensure + * that a reference is held for the network namespace while the sock is in use. + */ +static struct sock *audit_get_sk(const struct net *net) +{ +	struct audit_net *aunet; + +	if (!net) +		return NULL; + +	aunet = net_generic(net, audit_net_id); +	return aunet->sk; +} +  static void audit_set_portid(struct audit_buffer *ab, __u32 portid)  {  	if (ab) { @@ -210,9 +268,7 @@ void audit_panic(const char *message)  			pr_err("%s\n", message);  		break;  	case AUDIT_FAIL_PANIC: -		/* test audit_pid since printk is always losey, why bother? */ -		if (audit_pid) -			panic("audit: %s\n", message); +		panic("audit: %s\n", message);  		break;  	}  } @@ -370,21 +426,60 @@ static int audit_set_failure(u32 state)  	return audit_do_config_change("audit_failure", &audit_failure, state);  } -/* - * For one reason or another this nlh isn't getting delivered to the userspace - * audit daemon, just send it to printk. 
+/** + * auditd_set - Set/Reset the auditd connection state + * @pid: auditd PID + * @portid: auditd netlink portid + * @net: auditd network namespace pointer + * + * Description: + * This function will obtain and drop network namespace references as + * necessary. + */ +static void auditd_set(int pid, u32 portid, struct net *net) +{ +	unsigned long flags; + +	spin_lock_irqsave(&auditd_conn.lock, flags); +	auditd_conn.pid = pid; +	auditd_conn.portid = portid; +	if (auditd_conn.net) +		put_net(auditd_conn.net); +	if (net) +		auditd_conn.net = get_net(net); +	else +		auditd_conn.net = NULL; +	spin_unlock_irqrestore(&auditd_conn.lock, flags); +} + +/** + * kauditd_print_skb - Print the audit record to the ring buffer + * @skb: audit record + * + * Whatever the reason, this packet may not make it to the auditd connection + * so write it via printk so the information isn't completely lost.   */  static void kauditd_printk_skb(struct sk_buff *skb)  {  	struct nlmsghdr *nlh = nlmsg_hdr(skb);  	char *data = nlmsg_data(nlh); -	if (nlh->nlmsg_type != AUDIT_EOE) { -		if (printk_ratelimit()) -			pr_notice("type=%d %s\n", nlh->nlmsg_type, data); -		else -			audit_log_lost("printk limit exceeded"); -	} +	if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit()) +		pr_notice("type=%d %s\n", nlh->nlmsg_type, data); +} + +/** + * kauditd_rehold_skb - Handle a audit record send failure in the hold queue + * @skb: audit record + * + * Description: + * This should only be used by the kauditd_thread when it fails to flush the + * hold queue. + */ +static void kauditd_rehold_skb(struct sk_buff *skb) +{ +	/* put the record back in the queue at the same place */ +	skb_queue_head(&audit_hold_queue, skb);  }  /** @@ -444,48 +539,143 @@ static void kauditd_retry_skb(struct sk_buff *skb)   * auditd_reset - Disconnect the auditd connection   *   * Description: - * Break the auditd/kauditd connection and move all the records in the retry - * queue into the hold queue in case auditd reconnects.  The audit_cmd_mutex - * must be held when calling this function. + * Break the auditd/kauditd connection and move all the queued records into the + * hold queue in case auditd reconnects.   */  static void auditd_reset(void)  {  	struct sk_buff *skb; -	/* break the connection */ -	if (audit_sock) { -		sock_put(audit_sock); -		audit_sock = NULL; -	} -	audit_pid = 0; -	audit_nlk_portid = 0; +	/* if it isn't already broken, break the connection */ +	rcu_read_lock(); +	if (auditd_conn.pid) +		auditd_set(0, 0, NULL); +	rcu_read_unlock(); -	/* flush all of the retry queue to the hold queue */ +	/* flush all of the main and retry queues to the hold queue */  	while ((skb = skb_dequeue(&audit_retry_queue)))  		kauditd_hold_skb(skb); +	while ((skb = skb_dequeue(&audit_queue))) +		kauditd_hold_skb(skb);  }  /** - * kauditd_send_unicast_skb - Send a record via unicast to auditd + * auditd_send_unicast_skb - Send a record via unicast to auditd   * @skb: audit record + * + * Description: + * Send a skb to the audit daemon, returns positive/zero values on success and + * negative values on failure; in all cases the skb will be consumed by this + * function.  If the send results in -ECONNREFUSED the connection with auditd + * will be reset.  This function may sleep so callers should not hold any locks + * where this would cause a problem.   
*/ -static int kauditd_send_unicast_skb(struct sk_buff *skb) +static int auditd_send_unicast_skb(struct sk_buff *skb)  {  	int rc; +	u32 portid; +	struct net *net; +	struct sock *sk; -	/* if we know nothing is connected, don't even try the netlink call */ -	if (!audit_pid) -		return -ECONNREFUSED; +	/* NOTE: we can't call netlink_unicast while in the RCU section so +	 *       take a reference to the network namespace and grab local +	 *       copies of the namespace, the sock, and the portid; the +	 *       namespace and sock aren't going to go away while we hold a +	 *       reference and if the portid does become invalid after the RCU +	 *       section netlink_unicast() should safely return an error */ -	/* get an extra skb reference in case we fail to send */ -	skb_get(skb); -	rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); -	if (rc >= 0) { -		consume_skb(skb); -		rc = 0; +	rcu_read_lock(); +	if (!auditd_conn.pid) { +		rcu_read_unlock(); +		rc = -ECONNREFUSED; +		goto err;  	} +	net = auditd_conn.net; +	get_net(net); +	sk = audit_get_sk(net); +	portid = auditd_conn.portid; +	rcu_read_unlock(); + +	rc = netlink_unicast(sk, skb, portid, 0); +	put_net(net); +	if (rc < 0) +		goto err;  	return rc; + +err: +	if (rc == -ECONNREFUSED) +		auditd_reset(); +	return rc; +} + +/** + * kauditd_send_queue - Helper for kauditd_thread to flush skb queues + * @sk: the sending sock + * @portid: the netlink destination + * @queue: the skb queue to process + * @retry_limit: limit on number of netlink unicast failures + * @skb_hook: per-skb hook for additional processing + * @err_hook: hook called if the skb fails the netlink unicast send + * + * Description: + * Run through the given queue and attempt to send the audit records to auditd, + * returns zero on success, negative values on failure.  It is up to the caller + * to ensure that the @sk is valid for the duration of this function. + * + */ +static int kauditd_send_queue(struct sock *sk, u32 portid, +			      struct sk_buff_head *queue, +			      unsigned int retry_limit, +			      void (*skb_hook)(struct sk_buff *skb), +			      void (*err_hook)(struct sk_buff *skb)) +{ +	int rc = 0; +	struct sk_buff *skb; +	static unsigned int failed = 0; + +	/* NOTE: kauditd_thread takes care of all our locking, we just use +	 *       the netlink info passed to us (e.g. sk and portid) */ + +	while ((skb = skb_dequeue(queue))) { +		/* call the skb_hook for each skb we touch */ +		if (skb_hook) +			(*skb_hook)(skb); + +		/* can we send to anyone via unicast? */ +		if (!sk) { +			if (err_hook) +				(*err_hook)(skb); +			continue; +		} + +		/* grab an extra skb reference in case of error */ +		skb_get(skb); +		rc = netlink_unicast(sk, skb, portid, 0); +		if (rc < 0) { +			/* fatal failure for our queue flush attempt? */ +			if (++failed >= retry_limit || +			    rc == -ECONNREFUSED || rc == -EPERM) { +				/* yes - error processing for the queue */ +				sk = NULL; +				if (err_hook) +					(*err_hook)(skb); +				if (!skb_hook) +					goto out; +				/* keep processing with the skb_hook */ +				continue; +			} else +				/* no - requeue to preserve ordering */ +				skb_queue_head(queue, skb); +		} else { +			/* it worked - drop the extra reference and continue */ +			consume_skb(skb); +			failed = 0; +		} +	} + +out: +	return (rc >= 0 ? 
0 : rc);  }  /* @@ -493,16 +683,19 @@ static int kauditd_send_unicast_skb(struct sk_buff *skb)   * @skb: audit record   *   * Description: - * This function doesn't consume an skb as might be expected since it has to - * copy it anyways. + * Write a multicast message to anyone listening in the initial network + * namespace.  This function doesn't consume an skb as might be expected since + * it has to copy it anyways.   */  static void kauditd_send_multicast_skb(struct sk_buff *skb)  {  	struct sk_buff *copy; -	struct audit_net *aunet = net_generic(&init_net, audit_net_id); -	struct sock *sock = aunet->nlsk; +	struct sock *sock = audit_get_sk(&init_net);  	struct nlmsghdr *nlh; +	/* NOTE: we are not taking an additional reference for init_net since +	 *       we don't have to worry about it going away */ +  	if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))  		return; @@ -526,149 +719,79 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)  }  /** - * kauditd_wake_condition - Return true when it is time to wake kauditd_thread - * - * Description: - * This function is for use by the wait_event_freezable() call in - * kauditd_thread(). + * kauditd_thread - Worker thread to send audit records to userspace + * @dummy: unused   */ -static int kauditd_wake_condition(void) -{ -	static int pid_last = 0; -	int rc; -	int pid = audit_pid; - -	/* wake on new messages or a change in the connected auditd */ -	rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last); -	if (rc) -		pid_last = pid; - -	return rc; -} -  static int kauditd_thread(void *dummy)  {  	int rc; -	int auditd = 0; -	int reschedule = 0; -	struct sk_buff *skb; -	struct nlmsghdr *nlh; +	u32 portid = 0; +	struct net *net = NULL; +	struct sock *sk = NULL;  #define UNICAST_RETRIES 5 -#define AUDITD_BAD(x,y) \ -	((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES) - -	/* NOTE: we do invalidate the auditd connection flag on any sending -	 * errors, but we only "restore" the connection flag at specific places -	 * in the loop in order to help ensure proper ordering of audit -	 * records */  	set_freezable();  	while (!kthread_should_stop()) { -		/* NOTE: possible area for future improvement is to look at -		 *       the hold and retry queues, since only this thread -		 *       has access to these queues we might be able to do -		 *       our own queuing and skip some/all of the locking */ - -		/* NOTE: it might be a fun experiment to split the hold and -		 *       retry queue handling to another thread, but the -		 *       synchronization issues and other overhead might kill -		 *       any performance gains */ +		/* NOTE: see the lock comments in auditd_send_unicast_skb() */ +		rcu_read_lock(); +		if (!auditd_conn.pid) { +			rcu_read_unlock(); +			goto main_queue; +		} +		net = auditd_conn.net; +		get_net(net); +		sk = audit_get_sk(net); +		portid = auditd_conn.portid; +		rcu_read_unlock();  		/* attempt to flush the hold queue */ -		while (auditd && (skb = skb_dequeue(&audit_hold_queue))) { -			rc = kauditd_send_unicast_skb(skb); -			if (rc) { -				/* requeue to the same spot */ -				skb_queue_head(&audit_hold_queue, skb); - -				auditd = 0; -				if (AUDITD_BAD(rc, reschedule)) { -					mutex_lock(&audit_cmd_mutex); -					auditd_reset(); -					mutex_unlock(&audit_cmd_mutex); -					reschedule = 0; -				} -			} else -				/* we were able to send successfully */ -				reschedule = 0; +		rc = kauditd_send_queue(sk, portid, +					&audit_hold_queue, UNICAST_RETRIES, +					NULL, kauditd_rehold_skb); +		if (rc < 0) { 
+			sk = NULL; +			auditd_reset(); +			goto main_queue;  		}  		/* attempt to flush the retry queue */ -		while (auditd && (skb = skb_dequeue(&audit_retry_queue))) { -			rc = kauditd_send_unicast_skb(skb); -			if (rc) { -				auditd = 0; -				if (AUDITD_BAD(rc, reschedule)) { -					kauditd_hold_skb(skb); -					mutex_lock(&audit_cmd_mutex); -					auditd_reset(); -					mutex_unlock(&audit_cmd_mutex); -					reschedule = 0; -				} else -					/* temporary problem (we hope), queue -					 * to the same spot and retry */ -					skb_queue_head(&audit_retry_queue, skb); -			} else -				/* we were able to send successfully */ -				reschedule = 0; +		rc = kauditd_send_queue(sk, portid, +					&audit_retry_queue, UNICAST_RETRIES, +					NULL, kauditd_hold_skb); +		if (rc < 0) { +			sk = NULL; +			auditd_reset(); +			goto main_queue;  		} -		/* standard queue processing, try to be as quick as possible */ -quick_loop: -		skb = skb_dequeue(&audit_queue); -		if (skb) { -			/* setup the netlink header, see the comments in -			 * kauditd_send_multicast_skb() for length quirks */ -			nlh = nlmsg_hdr(skb); -			nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; - -			/* attempt to send to any multicast listeners */ -			kauditd_send_multicast_skb(skb); - -			/* attempt to send to auditd, queue on failure */ -			if (auditd) { -				rc = kauditd_send_unicast_skb(skb); -				if (rc) { -					auditd = 0; -					if (AUDITD_BAD(rc, reschedule)) { -						mutex_lock(&audit_cmd_mutex); -						auditd_reset(); -						mutex_unlock(&audit_cmd_mutex); -						reschedule = 0; -					} +main_queue: +		/* process the main queue - do the multicast send and attempt +		 * unicast, dump failed record sends to the retry queue; if +		 * sk == NULL due to previous failures we will just do the +		 * multicast send and move the record to the retry queue */ +		rc = kauditd_send_queue(sk, portid, &audit_queue, 1, +					kauditd_send_multicast_skb, +					kauditd_retry_skb); +		if (sk == NULL || rc < 0) +			auditd_reset(); +		sk = NULL; -					/* move to the retry queue */ -					kauditd_retry_skb(skb); -				} else -					/* everything is working so go fast! */ -					goto quick_loop; -			} else if (reschedule) -				/* we are currently having problems, move to -				 * the retry queue */ -				kauditd_retry_skb(skb); -			else -				/* dump the message via printk and hold it */ -				kauditd_hold_skb(skb); -		} else { -			/* we have flushed the backlog so wake everyone */ -			wake_up(&audit_backlog_wait); +		/* drop our netns reference, no auditd sends past this line */ +		if (net) { +			put_net(net); +			net = NULL; +		} -			/* if everything is okay with auditd (if present), go -			 * to sleep until there is something new in the queue -			 * or we have a change in the connected auditd; -			 * otherwise simply reschedule to give things a chance -			 * to recover */ -			if (reschedule) { -				set_current_state(TASK_INTERRUPTIBLE); -				schedule(); -			} else -				wait_event_freezable(kauditd_wait, -						     kauditd_wake_condition()); +		/* we have processed all the queues so wake everyone */ +		wake_up(&audit_backlog_wait); -			/* update the auditd connection status */ -			auditd = (audit_pid ? 1 : 0); -		} +		/* NOTE: we want to wake up if there is anything on the queue, +		 *       regardless of if an auditd is connected, as we need to +		 *       do the multicast send and rotate records from the +		 *       main queue to the retry/hold queues */ +		wait_event_freezable(kauditd_wait, +				     (skb_queue_len(&audit_queue) ? 
1 : 0));  	}  	return 0; @@ -678,17 +801,16 @@ int audit_send_list(void *_dest)  {  	struct audit_netlink_list *dest = _dest;  	struct sk_buff *skb; -	struct net *net = dest->net; -	struct audit_net *aunet = net_generic(net, audit_net_id); +	struct sock *sk = audit_get_sk(dest->net);  	/* wait for parent to finish and send an ACK */  	mutex_lock(&audit_cmd_mutex);  	mutex_unlock(&audit_cmd_mutex);  	while ((skb = __skb_dequeue(&dest->q)) != NULL) -		netlink_unicast(aunet->nlsk, skb, dest->portid, 0); +		netlink_unicast(sk, skb, dest->portid, 0); -	put_net(net); +	put_net(dest->net);  	kfree(dest);  	return 0; @@ -722,16 +844,15 @@ out_kfree_skb:  static int audit_send_reply_thread(void *arg)  {  	struct audit_reply *reply = (struct audit_reply *)arg; -	struct net *net = reply->net; -	struct audit_net *aunet = net_generic(net, audit_net_id); +	struct sock *sk = audit_get_sk(reply->net);  	mutex_lock(&audit_cmd_mutex);  	mutex_unlock(&audit_cmd_mutex);  	/* Ignore failure. It'll only happen if the sender goes away,  	   because our timeout is set to infinite. */ -	netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); -	put_net(net); +	netlink_unicast(sk, reply->skb, reply->portid, 0); +	put_net(reply->net);  	kfree(reply);  	return 0;  } @@ -949,12 +1070,12 @@ static int audit_set_feature(struct sk_buff *skb)  static int audit_replace(pid_t pid)  { -	struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, -					       &pid, sizeof(pid)); +	struct sk_buff *skb; +	skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid));  	if (!skb)  		return -ENOMEM; -	return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); +	return auditd_send_unicast_skb(skb);  }  static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -981,7 +1102,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		memset(&s, 0, sizeof(s));  		s.enabled		= audit_enabled;  		s.failure		= audit_failure; -		s.pid			= audit_pid; +		rcu_read_lock(); +		s.pid			= auditd_conn.pid; +		rcu_read_unlock();  		s.rate_limit		= audit_rate_limit;  		s.backlog_limit		= audit_backlog_limit;  		s.lost			= atomic_read(&audit_lost); @@ -1014,30 +1137,44 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  			 *       from the initial pid namespace, but something  			 *       to keep in mind if this changes */  			int new_pid = s.pid; +			pid_t auditd_pid;  			pid_t requesting_pid = task_tgid_vnr(current); -			if ((!new_pid) && (requesting_pid != audit_pid)) { -				audit_log_config_change("audit_pid", new_pid, audit_pid, 0); +			/* test the auditd connection */ +			audit_replace(requesting_pid); + +			rcu_read_lock(); +			auditd_pid = auditd_conn.pid; +			/* only the current auditd can unregister itself */ +			if ((!new_pid) && (requesting_pid != auditd_pid)) { +				rcu_read_unlock(); +				audit_log_config_change("audit_pid", new_pid, +							auditd_pid, 0);  				return -EACCES;  			} -			if (audit_pid && new_pid && -			    audit_replace(requesting_pid) != -ECONNREFUSED) { -				audit_log_config_change("audit_pid", new_pid, audit_pid, 0); +			/* replacing a healthy auditd is not allowed */ +			if (auditd_pid && new_pid) { +				rcu_read_unlock(); +				audit_log_config_change("audit_pid", new_pid, +							auditd_pid, 0);  				return -EEXIST;  			} +			rcu_read_unlock(); +  			if (audit_enabled != AUDIT_OFF) -				audit_log_config_change("audit_pid", new_pid, audit_pid, 1); +				audit_log_config_change("audit_pid", new_pid, +							auditd_pid, 1); +  			if 
(new_pid) { -				if (audit_sock) -					sock_put(audit_sock); -				audit_pid = new_pid; -				audit_nlk_portid = NETLINK_CB(skb).portid; -				sock_hold(skb->sk); -				audit_sock = skb->sk; -			} else { +				/* register a new auditd connection */ +				auditd_set(new_pid, +					   NETLINK_CB(skb).portid, +					   sock_net(NETLINK_CB(skb).sk)); +				/* try to process any backlog */ +				wake_up_interruptible(&kauditd_wait); +			} else +				/* unregister the auditd connection */  				auditd_reset(); -			} -			wake_up_interruptible(&kauditd_wait);  		}  		if (s.mask & AUDIT_STATUS_RATE_LIMIT) {  			err = audit_set_rate_limit(s.rate_limit); @@ -1090,7 +1227,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  				if (err)  					break;  			} -			mutex_unlock(&audit_cmd_mutex);  			audit_log_common_recv_msg(&ab, msg_type);  			if (msg_type != AUDIT_USER_TTY)  				audit_log_format(ab, " msg='%.*s'", @@ -1108,7 +1244,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  			}  			audit_set_portid(ab, NETLINK_CB(skb).portid);  			audit_log_end(ab); -			mutex_lock(&audit_cmd_mutex);  		}  		break;  	case AUDIT_ADD_RULE: @@ -1298,26 +1433,26 @@ static int __net_init audit_net_init(struct net *net)  	struct audit_net *aunet = net_generic(net, audit_net_id); -	aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); -	if (aunet->nlsk == NULL) { +	aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); +	if (aunet->sk == NULL) {  		audit_panic("cannot initialize netlink socket in namespace");  		return -ENOMEM;  	} -	aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; +	aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; +  	return 0;  }  static void __net_exit audit_net_exit(struct net *net)  {  	struct audit_net *aunet = net_generic(net, audit_net_id); -	struct sock *sock = aunet->nlsk; -	mutex_lock(&audit_cmd_mutex); -	if (sock == audit_sock) + +	rcu_read_lock(); +	if (net == auditd_conn.net)  		auditd_reset(); -	mutex_unlock(&audit_cmd_mutex); +	rcu_read_unlock(); -	netlink_kernel_release(sock); -	aunet->nlsk = NULL; +	netlink_kernel_release(aunet->sk);  }  static struct pernet_operations audit_net_ops __net_initdata = { @@ -1335,20 +1470,24 @@ static int __init audit_init(void)  	if (audit_initialized == AUDIT_DISABLED)  		return 0; -	pr_info("initializing netlink subsys (%s)\n", -		audit_default ? "enabled" : "disabled"); -	register_pernet_subsys(&audit_net_ops); +	memset(&auditd_conn, 0, sizeof(auditd_conn)); +	spin_lock_init(&auditd_conn.lock);  	skb_queue_head_init(&audit_queue);  	skb_queue_head_init(&audit_retry_queue);  	skb_queue_head_init(&audit_hold_queue); -	audit_initialized = AUDIT_INITIALIZED; -	audit_enabled = audit_default; -	audit_ever_enabled |= !!audit_default;  	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)  		INIT_LIST_HEAD(&audit_inode_hash[i]); +	pr_info("initializing netlink subsys (%s)\n", +		audit_default ? "enabled" : "disabled"); +	register_pernet_subsys(&audit_net_ops); + +	audit_initialized = AUDIT_INITIALIZED; +	audit_enabled = audit_default; +	audit_ever_enabled |= !!audit_default; +  	kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");  	if (IS_ERR(kauditd_task)) {  		int err = PTR_ERR(kauditd_task); @@ -1519,20 +1658,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,  	if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))  		return NULL; -	/* don't ever fail/sleep on these two conditions: +	/* NOTE: don't ever fail/sleep on these two conditions:  	 * 1. 
auditd generated record - since we need auditd to drain the  	 *    queue; also, when we are checking for auditd, compare PIDs using  	 *    task_tgid_vnr() since auditd_pid is set in audit_receive_msg()  	 *    using a PID anchored in the caller's namespace -	 * 2. audit command message - record types 1000 through 1099 inclusive -	 *    are command messages/records used to manage the kernel subsystem -	 *    and the audit userspace, blocking on these messages could cause -	 *    problems under load so don't do it (note: not all of these -	 *    command types are valid as record types, but it is quicker to -	 *    just check two ints than a series of ints in a if/switch stmt) */ -	if (!((audit_pid && audit_pid == task_tgid_vnr(current)) || -	      (type >= 1000 && type <= 1099))) { -		long sleep_time = audit_backlog_wait_time; +	 * 2. generator holding the audit_cmd_mutex - we don't want to block +	 *    while holding the mutex */ +	if (!(auditd_test_task(current) || +	      (current == __mutex_owner(&audit_cmd_mutex)))) { +		long stime = audit_backlog_wait_time;  		while (audit_backlog_limit &&  		       (skb_queue_len(&audit_queue) > audit_backlog_limit)) { @@ -1541,14 +1676,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,  			/* sleep if we are allowed and we haven't exhausted our  			 * backlog wait limit */ -			if ((gfp_mask & __GFP_DIRECT_RECLAIM) && -			    (sleep_time > 0)) { +			if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {  				DECLARE_WAITQUEUE(wait, current);  				add_wait_queue_exclusive(&audit_backlog_wait,  							 &wait);  				set_current_state(TASK_UNINTERRUPTIBLE); -				sleep_time = schedule_timeout(sleep_time); +				stime = schedule_timeout(stime);  				remove_wait_queue(&audit_backlog_wait, &wait);  			} else {  				if (audit_rate_check() && printk_ratelimit()) @@ -2127,15 +2261,27 @@ out:   */  void audit_log_end(struct audit_buffer *ab)  { +	struct sk_buff *skb; +	struct nlmsghdr *nlh; +  	if (!ab)  		return; -	if (!audit_rate_check()) { -		audit_log_lost("rate limit exceeded"); -	} else { -		skb_queue_tail(&audit_queue, ab->skb); -		wake_up_interruptible(&kauditd_wait); + +	if (audit_rate_check()) { +		skb = ab->skb;  		ab->skb = NULL; -	} + +		/* setup the netlink header, see the comments in +		 * kauditd_send_multicast_skb() for length quirks */ +		nlh = nlmsg_hdr(skb); +		nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + +		/* queue the netlink packet and poke the kauditd thread */ +		skb_queue_tail(&audit_queue, skb); +		wake_up_interruptible(&kauditd_wait); +	} else +		audit_log_lost("rate limit exceeded"); +  	audit_buffer_free(ab);  } diff --git a/kernel/audit.h b/kernel/audit.h index ca579880303a..0d87f8ab8778 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -218,7 +218,7 @@ extern void audit_log_name(struct audit_context *context,  			   struct audit_names *n, const struct path *path,  			   int record_num, int *call_panic); -extern int audit_pid; +extern int auditd_test_task(const struct task_struct *task);  #define AUDIT_INODE_BUCKETS	32  extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -250,10 +250,6 @@ struct audit_netlink_list {  int audit_send_list(void *); -struct audit_net { -	struct sock *nlsk; -}; -  extern int selinux_audit_rule_update(void);  extern struct mutex audit_filter_mutex; @@ -337,14 +333,7 @@ extern u32 audit_sig_sid;  extern int audit_filter(int msgtype, unsigned int listtype);  #ifdef CONFIG_AUDITSYSCALL -extern int __audit_signal_info(int sig, struct task_struct *t); 
-static inline int audit_signal_info(int sig, struct task_struct *t) -{ -	if (unlikely((audit_pid && t->tgid == audit_pid) || -		     (audit_signals && !audit_dummy_context()))) -		return __audit_signal_info(sig, t); -	return 0; -} +extern int audit_signal_info(int sig, struct task_struct *t);  extern void audit_filter_inodes(struct task_struct *, struct audit_context *);  extern struct list_head *audit_killed_trees(void);  #else diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d6a8de5f8fa3..1c2333155893 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -762,7 +762,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,  	struct audit_entry *e;  	enum audit_state state; -	if (audit_pid && tsk->tgid == audit_pid) +	if (auditd_test_task(tsk))  		return AUDIT_DISABLED;  	rcu_read_lock(); @@ -816,7 +816,7 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)  {  	struct audit_names *n; -	if (audit_pid && tsk->tgid == audit_pid) +	if (auditd_test_task(tsk))  		return;  	rcu_read_lock(); @@ -2249,26 +2249,27 @@ void __audit_ptrace(struct task_struct *t)   * If the audit subsystem is being terminated, record the task (pid)   * and uid that is doing that.   */ -int __audit_signal_info(int sig, struct task_struct *t) +int audit_signal_info(int sig, struct task_struct *t)  {  	struct audit_aux_data_pids *axp;  	struct task_struct *tsk = current;  	struct audit_context *ctx = tsk->audit_context;  	kuid_t uid = current_uid(), t_uid = task_uid(t); -	if (audit_pid && t->tgid == audit_pid) { -		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { -			audit_sig_pid = task_tgid_nr(tsk); -			if (uid_valid(tsk->loginuid)) -				audit_sig_uid = tsk->loginuid; -			else -				audit_sig_uid = uid; -			security_task_getsecid(tsk, &audit_sig_sid); -		} -		if (!audit_signals || audit_dummy_context()) -			return 0; +	if (auditd_test_task(t) && +	    (sig == SIGTERM || sig == SIGHUP || +	     sig == SIGUSR1 || sig == SIGUSR2)) { +		audit_sig_pid = task_tgid_nr(tsk); +		if (uid_valid(tsk->loginuid)) +			audit_sig_uid = tsk->loginuid; +		else +			audit_sig_uid = uid; +		security_task_getsecid(tsk, &audit_sig_sid);  	} +	if (!audit_signals || audit_dummy_context()) +		return 0; +  	/* optimize the common case by putting first signal recipient directly  	 * in audit_context */  	if (!ctx->target_pid) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f45827e205d3..b4f1cb0c5ac7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1162,12 +1162,12 @@ out:  	LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */  		off = IMM;  load_word: -		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are -		 * only appearing in the programs where ctx == -		 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] -		 * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, -		 * internal BPF verifier will check that BPF_R6 == -		 * ctx. +		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only +		 * appearing in the programs where ctx == skb +		 * (see may_access_skb() in the verifier). All programs +		 * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, +		 * bpf_convert_filter() saves it in BPF_R6, internal BPF +		 * verifier will check that BPF_R6 == ctx.  		 
*  		 * BPF_ABS and BPF_IND are wrappers of function calls,  		 * so they scratch BPF_R1-BPF_R5 registers, preserve diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ea87fb19a94..361a69dfe543 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -13,11 +13,12 @@  #include <linux/bpf.h>  #include <linux/jhash.h>  #include <linux/filter.h> +#include <linux/rculist_nulls.h>  #include "percpu_freelist.h"  #include "bpf_lru_list.h"  struct bucket { -	struct hlist_head head; +	struct hlist_nulls_head head;  	raw_spinlock_t lock;  }; @@ -29,28 +30,26 @@ struct bpf_htab {  		struct pcpu_freelist freelist;  		struct bpf_lru lru;  	}; -	void __percpu *extra_elems; +	struct htab_elem *__percpu *extra_elems;  	atomic_t count;	/* number of elements in this hashtable */  	u32 n_buckets;	/* number of hash buckets */  	u32 elem_size;	/* size of each element in bytes */  }; -enum extra_elem_state { -	HTAB_NOT_AN_EXTRA_ELEM = 0, -	HTAB_EXTRA_ELEM_FREE, -	HTAB_EXTRA_ELEM_USED -}; -  /* each htab element is struct htab_elem + key + value */  struct htab_elem {  	union { -		struct hlist_node hash_node; -		struct bpf_htab *htab; -		struct pcpu_freelist_node fnode; +		struct hlist_nulls_node hash_node; +		struct { +			void *padding; +			union { +				struct bpf_htab *htab; +				struct pcpu_freelist_node fnode; +			}; +		};  	};  	union {  		struct rcu_head rcu; -		enum extra_elem_state state;  		struct bpf_lru_node lru_node;  	};  	u32 hash; @@ -71,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)  		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;  } +static bool htab_is_prealloc(const struct bpf_htab *htab) +{ +	return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +} +  static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,  				     void __percpu *pptr)  { @@ -122,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,  static int prealloc_init(struct bpf_htab *htab)  { +	u32 num_entries = htab->map.max_entries;  	int err = -ENOMEM, i; -	htab->elems = bpf_map_area_alloc(htab->elem_size * -					 htab->map.max_entries); +	if (!htab_is_percpu(htab) && !htab_is_lru(htab)) +		num_entries += num_possible_cpus(); + +	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);  	if (!htab->elems)  		return -ENOMEM;  	if (!htab_is_percpu(htab))  		goto skip_percpu_elems; -	for (i = 0; i < htab->map.max_entries; i++) { +	for (i = 0; i < num_entries; i++) {  		u32 size = round_up(htab->map.value_size, 8);  		void __percpu *pptr; @@ -160,10 +167,11 @@ skip_percpu_elems:  	if (htab_is_lru(htab))  		bpf_lru_populate(&htab->lru, htab->elems,  				 offsetof(struct htab_elem, lru_node), -				 htab->elem_size, htab->map.max_entries); +				 htab->elem_size, num_entries);  	else -		pcpu_freelist_populate(&htab->freelist, htab->elems, -				       htab->elem_size, htab->map.max_entries); +		pcpu_freelist_populate(&htab->freelist, +				       htab->elems + offsetof(struct htab_elem, fnode), +				       htab->elem_size, num_entries);  	return 0; @@ -184,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)  static int alloc_extra_elems(struct bpf_htab *htab)  { -	void __percpu *pptr; +	struct htab_elem *__percpu *pptr, *l_new; +	struct pcpu_freelist_node *l;  	int cpu; -	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); +	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, +				  GFP_USER | __GFP_NOWARN);  	if (!pptr)  		return -ENOMEM;  	for_each_possible_cpu(cpu) { -		((struct htab_elem 
*)per_cpu_ptr(pptr, cpu))->state = -			HTAB_EXTRA_ELEM_FREE; +		l = pcpu_freelist_pop(&htab->freelist); +		/* pop will succeed, since prealloc_init() +		 * preallocated extra num_possible_cpus elements +		 */ +		l_new = container_of(l, struct htab_elem, fnode); +		*per_cpu_ptr(pptr, cpu) = l_new;  	}  	htab->extra_elems = pptr;  	return 0; @@ -217,6 +231,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)  	int err, i;  	u64 cost; +	BUILD_BUG_ON(offsetof(struct htab_elem, htab) != +		     offsetof(struct htab_elem, hash_node.pprev)); +	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != +		     offsetof(struct htab_elem, hash_node.pprev)); +  	if (lru && !capable(CAP_SYS_ADMIN))  		/* LRU implementation is much complicated than other  		 * maps.  Hence, limit to CAP_SYS_ADMIN for now. @@ -326,29 +345,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)  		goto free_htab;  	for (i = 0; i < htab->n_buckets; i++) { -		INIT_HLIST_HEAD(&htab->buckets[i].head); +		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);  		raw_spin_lock_init(&htab->buckets[i].lock);  	} -	if (!percpu && !lru) { -		/* lru itself can remove the least used element, so -		 * there is no need for an extra elem during map_update. -		 */ -		err = alloc_extra_elems(htab); -		if (err) -			goto free_buckets; -	} -  	if (prealloc) {  		err = prealloc_init(htab);  		if (err) -			goto free_extra_elems; +			goto free_buckets; + +		if (!percpu && !lru) { +			/* lru itself can remove the least used element, so +			 * there is no need for an extra elem during map_update. +			 */ +			err = alloc_extra_elems(htab); +			if (err) +				goto free_prealloc; +		}  	}  	return &htab->map; -free_extra_elems: -	free_percpu(htab->extra_elems); +free_prealloc: +	prealloc_destroy(htab);  free_buckets:  	bpf_map_area_free(htab->buckets);  free_htab: @@ -366,20 +385,44 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)  	return &htab->buckets[hash & (htab->n_buckets - 1)];  } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)  {  	return &__select_bucket(htab, hash)->head;  } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, +/* this lookup function can only be called with bucket lock taken */ +static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,  					 void *key, u32 key_size)  { +	struct hlist_nulls_node *n; +	struct htab_elem *l; + +	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) +		if (l->hash == hash && !memcmp(&l->key, key, key_size)) +			return l; + +	return NULL; +} + +/* can be called without bucket lock. 
it will repeat the loop in + * the unlikely event when elements moved from one bucket into another + * while link list is being walked + */ +static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, +					       u32 hash, void *key, +					       u32 key_size, u32 n_buckets) +{ +	struct hlist_nulls_node *n;  	struct htab_elem *l; -	hlist_for_each_entry_rcu(l, head, hash_node) +again: +	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)  		if (l->hash == hash && !memcmp(&l->key, key, key_size))  			return l; +	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) +		goto again; +  	return NULL;  } @@ -387,7 +430,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,  static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); -	struct hlist_head *head; +	struct hlist_nulls_head *head;  	struct htab_elem *l;  	u32 hash, key_size; @@ -400,7 +443,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)  	head = select_bucket(htab, hash); -	l = lookup_elem_raw(head, hash, key, key_size); +	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);  	return l;  } @@ -433,8 +476,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)  {  	struct bpf_htab *htab = (struct bpf_htab *)arg; -	struct htab_elem *l, *tgt_l; -	struct hlist_head *head; +	struct htab_elem *l = NULL, *tgt_l; +	struct hlist_nulls_head *head; +	struct hlist_nulls_node *n;  	unsigned long flags;  	struct bucket *b; @@ -444,9 +488,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)  	raw_spin_lock_irqsave(&b->lock, flags); -	hlist_for_each_entry_rcu(l, head, hash_node) +	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)  		if (l == tgt_l) { -			hlist_del_rcu(&l->hash_node); +			hlist_nulls_del_rcu(&l->hash_node);  			break;  		} @@ -459,7 +503,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); -	struct hlist_head *head; +	struct hlist_nulls_head *head;  	struct htab_elem *l, *next_l;  	u32 hash, key_size;  	int i; @@ -473,7 +517,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)  	head = select_bucket(htab, hash);  	/* lookup the key */ -	l = lookup_elem_raw(head, hash, key, key_size); +	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);  	if (!l) {  		i = 0; @@ -481,7 +525,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)  	}  	/* key was found, get next key in the same bucket */ -	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), +	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),  				  struct htab_elem, hash_node);  	if (next_l) { @@ -500,7 +544,7 @@ find_first_elem:  		head = select_bucket(htab, i);  		/* pick first element in the bucket */ -		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), +		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),  					  struct htab_elem, hash_node);  		if (next_l) {  			/* if it's not empty, just return it */ @@ -538,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)  static void free_htab_elem(struct 
bpf_htab *htab, struct htab_elem *l)  { -	if (l->state == HTAB_EXTRA_ELEM_USED) { -		l->state = HTAB_EXTRA_ELEM_FREE; -		return; -	} - -	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { +	if (htab_is_prealloc(htab)) {  		pcpu_freelist_push(&htab->freelist, &l->fnode);  	} else {  		atomic_dec(&htab->count); @@ -573,43 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  					 void *value, u32 key_size, u32 hash,  					 bool percpu, bool onallcpus, -					 bool old_elem_exists) +					 struct htab_elem *old_elem)  {  	u32 size = htab->map.value_size; -	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); -	struct htab_elem *l_new; +	bool prealloc = htab_is_prealloc(htab); +	struct htab_elem *l_new, **pl_new;  	void __percpu *pptr; -	int err = 0;  	if (prealloc) { -		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); -		if (!l_new) -			err = -E2BIG; -	} else { -		if (atomic_inc_return(&htab->count) > htab->map.max_entries) { -			atomic_dec(&htab->count); -			err = -E2BIG; +		if (old_elem) { +			/* if we're updating the existing element, +			 * use per-cpu extra elems to avoid freelist_pop/push +			 */ +			pl_new = this_cpu_ptr(htab->extra_elems); +			l_new = *pl_new; +			*pl_new = old_elem;  		} else { -			l_new = kmalloc(htab->elem_size, -					GFP_ATOMIC | __GFP_NOWARN); -			if (!l_new) -				return ERR_PTR(-ENOMEM); -		} -	} +			struct pcpu_freelist_node *l; -	if (err) { -		if (!old_elem_exists) -			return ERR_PTR(err); - -		/* if we're updating the existing element and the hash table -		 * is full, use per-cpu extra elems -		 */ -		l_new = this_cpu_ptr(htab->extra_elems); -		if (l_new->state != HTAB_EXTRA_ELEM_FREE) -			return ERR_PTR(-E2BIG); -		l_new->state = HTAB_EXTRA_ELEM_USED; +			l = pcpu_freelist_pop(&htab->freelist); +			if (!l) +				return ERR_PTR(-E2BIG); +			l_new = container_of(l, struct htab_elem, fnode); +		}  	} else { -		l_new->state = HTAB_NOT_AN_EXTRA_ELEM; +		if (atomic_inc_return(&htab->count) > htab->map.max_entries) +			if (!old_elem) { +				/* when map is full and update() is replacing +				 * old element, it's ok to allocate, since +				 * old element will be freed immediately. 
+				 * Otherwise return an error +				 */ +				atomic_dec(&htab->count); +				return ERR_PTR(-E2BIG); +			} +		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); +		if (!l_new) +			return ERR_PTR(-ENOMEM);  	}  	memcpy(l_new->key, key, key_size); @@ -661,7 +700,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l_new = NULL, *l_old; -	struct hlist_head *head; +	struct hlist_nulls_head *head;  	unsigned long flags;  	struct bucket *b;  	u32 key_size, hash; @@ -690,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,  		goto err;  	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, -				!!l_old); +				l_old);  	if (IS_ERR(l_new)) {  		/* all pre-allocated elements are in use or memory exhausted */  		ret = PTR_ERR(l_new); @@ -700,10 +739,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,  	/* add new element to the head of the list, so that  	 * concurrent search will find it before old elem  	 */ -	hlist_add_head_rcu(&l_new->hash_node, head); +	hlist_nulls_add_head_rcu(&l_new->hash_node, head);  	if (l_old) { -		hlist_del_rcu(&l_old->hash_node); -		free_htab_elem(htab, l_old); +		hlist_nulls_del_rcu(&l_old->hash_node); +		if (!htab_is_prealloc(htab)) +			free_htab_elem(htab, l_old);  	}  	ret = 0;  err: @@ -716,7 +756,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l_new, *l_old = NULL; -	struct hlist_head *head; +	struct hlist_nulls_head *head;  	unsigned long flags;  	struct bucket *b;  	u32 key_size, hash; @@ -757,10 +797,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,  	/* add new element to the head of the list, so that  	 * concurrent search will find it before old elem  	 */ -	hlist_add_head_rcu(&l_new->hash_node, head); +	hlist_nulls_add_head_rcu(&l_new->hash_node, head);  	if (l_old) {  		bpf_lru_node_set_ref(&l_new->lru_node); -		hlist_del_rcu(&l_old->hash_node); +		hlist_nulls_del_rcu(&l_old->hash_node);  	}  	ret = 0; @@ -781,7 +821,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l_new = NULL, *l_old; -	struct hlist_head *head; +	struct hlist_nulls_head *head;  	unsigned long flags;  	struct bucket *b;  	u32 key_size, hash; @@ -815,12 +855,12 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,  				value, onallcpus);  	} else {  		l_new = alloc_htab_elem(htab, key, value, key_size, -					hash, true, onallcpus, false); +					hash, true, onallcpus, NULL);  		if (IS_ERR(l_new)) {  			ret = PTR_ERR(l_new);  			goto err;  		} -		hlist_add_head_rcu(&l_new->hash_node, head); +		hlist_nulls_add_head_rcu(&l_new->hash_node, head);  	}  	ret = 0;  err: @@ -834,7 +874,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l_new = NULL, *l_old; -	struct hlist_head *head; +	struct hlist_nulls_head *head;  	unsigned long flags;  	struct bucket *b;  	u32 key_size, hash; @@ -882,7 +922,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,  	} else {  		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),  				value, 
onallcpus); -		hlist_add_head_rcu(&l_new->hash_node, head); +		hlist_nulls_add_head_rcu(&l_new->hash_node, head);  		l_new = NULL;  	}  	ret = 0; @@ -910,7 +950,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,  static int htab_map_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); -	struct hlist_head *head; +	struct hlist_nulls_head *head;  	struct bucket *b;  	struct htab_elem *l;  	unsigned long flags; @@ -930,7 +970,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)  	l = lookup_elem_raw(head, hash, key, key_size);  	if (l) { -		hlist_del_rcu(&l->hash_node); +		hlist_nulls_del_rcu(&l->hash_node);  		free_htab_elem(htab, l);  		ret = 0;  	} @@ -942,7 +982,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)  static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); -	struct hlist_head *head; +	struct hlist_nulls_head *head;  	struct bucket *b;  	struct htab_elem *l;  	unsigned long flags; @@ -962,7 +1002,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)  	l = lookup_elem_raw(head, hash, key, key_size);  	if (l) { -		hlist_del_rcu(&l->hash_node); +		hlist_nulls_del_rcu(&l->hash_node);  		ret = 0;  	} @@ -977,14 +1017,13 @@ static void delete_all_elements(struct bpf_htab *htab)  	int i;  	for (i = 0; i < htab->n_buckets; i++) { -		struct hlist_head *head = select_bucket(htab, i); -		struct hlist_node *n; +		struct hlist_nulls_head *head = select_bucket(htab, i); +		struct hlist_nulls_node *n;  		struct htab_elem *l; -		hlist_for_each_entry_safe(l, n, head, hash_node) { -			hlist_del_rcu(&l->hash_node); -			if (l->state != HTAB_EXTRA_ELEM_USED) -				htab_elem_free(htab, l); +		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { +			hlist_nulls_del_rcu(&l->hash_node); +			htab_elem_free(htab, l);  		}  	}  } @@ -1004,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map)  	 * not have executed. Wait for them.  	 */  	rcu_barrier(); -	if (htab->map.map_flags & BPF_F_NO_PREALLOC) +	if (!htab_is_prealloc(htab))  		delete_all_elements(htab);  	else  		prealloc_destroy(htab); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 8bfe0afaee10..b37bd9ab7f57 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -500,9 +500,15 @@ unlock:  	raw_spin_unlock(&trie->lock);  } +static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ +	return -ENOTSUPP; +} +  static const struct bpf_map_ops trie_ops = {  	.map_alloc = trie_alloc,  	.map_free = trie_free, +	.map_get_next_key = trie_get_next_key,  	.map_lookup_elem = trie_lookup_elem,  	.map_update_elem = trie_update_elem,  	.map_delete_elem = trie_delete_elem, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7af0dcc5d755..821f9e807de5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -617,6 +617,14 @@ static void fixup_bpf_calls(struct bpf_prog *prog)  			if (insn->imm == BPF_FUNC_xdp_adjust_head)  				prog->xdp_adjust_head = 1;  			if (insn->imm == BPF_FUNC_tail_call) { +				/* If we tail call into other programs, we +				 * cannot make any assumptions since they +				 * can be replaced dynamically during runtime +				 * in the program array. 
+				 */ +				prog->cb_access = 1; +				prog->xdp_adjust_head = 1; +  				/* mark bpf_tail_call as different opcode  				 * to avoid conditional branch in  				 * interpeter for every normal call diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 796b68d00119..a834068a400e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -765,38 +765,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)  	}  } -static int check_ptr_alignment(struct bpf_verifier_env *env, -			       struct bpf_reg_state *reg, int off, int size) +static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +				   int off, int size)  { -	if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { -		if (off % size != 0) { -			verbose("misaligned access off %d size %d\n", -				off, size); -			return -EACCES; -		} else { -			return 0; -		} -	} - -	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) -		/* misaligned access to packet is ok on x86,arm,arm64 */ -		return 0; -  	if (reg->id && size != 1) { -		verbose("Unknown packet alignment. Only byte-sized access allowed\n"); +		verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n");  		return -EACCES;  	}  	/* skb->data is NET_IP_ALIGN-ed */ -	if (reg->type == PTR_TO_PACKET && -	    (NET_IP_ALIGN + reg->off + off) % size != 0) { +	if ((NET_IP_ALIGN + reg->off + off) % size != 0) {  		verbose("misaligned packet access off %d+%d+%d size %d\n",  			NET_IP_ALIGN, reg->off, off, size);  		return -EACCES;  	} +  	return 0;  } +static int check_val_ptr_alignment(const struct bpf_reg_state *reg, +				   int size) +{ +	if (size != 1) { +		verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); +		return -EACCES; +	} + +	return 0; +} + +static int check_ptr_alignment(const struct bpf_reg_state *reg, +			       int off, int size) +{ +	switch (reg->type) { +	case PTR_TO_PACKET: +		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : +		       check_pkt_ptr_alignment(reg, off, size); +	case PTR_TO_MAP_VALUE_ADJ: +		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : +		       check_val_ptr_alignment(reg, size); +	default: +		if (off % size != 0) { +			verbose("misaligned access off %d size %d\n", +				off, size); +			return -EACCES; +		} + +		return 0; +	} +} +  /* check whether memory at (regno + off) is accessible for t = (read | write)   * if t==write, value_regno is a register which value is stored into memory   * if t==read, value_regno is a register which will receive the value from memory @@ -818,7 +836,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,  	if (size < 0)  		return size; -	err = check_ptr_alignment(env, reg, off, size); +	err = check_ptr_alignment(reg, off, size);  	if (err)  		return err; @@ -1925,6 +1943,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  		 * register as unknown.  		 
*/  		if (env->allow_ptr_leaks && +		    BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&  		    (dst_reg->type == PTR_TO_MAP_VALUE ||  		     dst_reg->type == PTR_TO_MAP_VALUE_ADJ))  			dst_reg->type = PTR_TO_MAP_VALUE_ADJ; @@ -1973,14 +1992,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,  	for (i = 0; i < MAX_BPF_REG; i++)  		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) -			regs[i].range = dst_reg->off; +			/* keep the maximum range already checked */ +			regs[i].range = max(regs[i].range, dst_reg->off);  	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {  		if (state->stack_slot_type[i] != STACK_SPILL)  			continue;  		reg = &state->spilled_regs[i / BPF_REG_SIZE];  		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) -			reg->range = dst_reg->off; +			reg->range = max(reg->range, dst_reg->off);  	}  } diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 56eba9caa632..1dc22f6b49f5 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1329,7 +1329,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)  		struct task_struct *task;  		int count = 0; -		seq_printf(seq, "css_set %p\n", cset); +		seq_printf(seq, "css_set %pK\n", cset);  		list_for_each_entry(task, &cset->tasks, cg_list) {  			if (count++ > MAX_TASKS_SHOWN_PER_CSS) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0125589c7428..687f5e0194ef 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2425,11 +2425,12 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,  		tsk = tsk->group_leader;  	/* -	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become -	 * trapped in a cpuset, or RT worker may be born in a cgroup -	 * with no rt_runtime allocated.  Just say no. +	 * kthreads may acquire PF_NO_SETAFFINITY during initialization. +	 * If userland migrates such a kthread to a non-root cgroup, it can +	 * become trapped in a cpuset, or RT kthread may be born in a +	 * cgroup with no rt_runtime allocated.  Just say no.  	 */ -	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { +	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {  		ret = -EINVAL;  		goto out_unlock_rcu;  	} @@ -2669,7 +2670,7 @@ static bool css_visible(struct cgroup_subsys_state *css)   *   * Returns 0 on success, -errno on failure.  On failure, csses which have   * been processed already aren't cleaned up.  The caller is responsible for - * cleaning up with cgroup_apply_control_disble(). + * cleaning up with cgroup_apply_control_disable().   */  static int cgroup_apply_control_enable(struct cgroup *cgrp)  { diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index e756dae49300..2237201d66d5 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -229,7 +229,7 @@ static int pids_can_fork(struct task_struct *task)  		/* Only log the first time events_limit is incremented. 
*/  		if (atomic64_inc_return(&pids->events_limit) == 1) {  			pr_info("cgroup: fork rejected by pids controller in "); -			pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); +			pr_cont_cgroup_path(css->cgroup);  			pr_cont("\n");  		}  		cgroup_file_notify(&pids->events_file); diff --git a/kernel/cpu.c b/kernel/cpu.c index f7c063239fa5..37b223e4fc05 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1335,26 +1335,21 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,  	struct cpuhp_step *sp;  	int ret = 0; -	mutex_lock(&cpuhp_state_mutex); -  	if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {  		ret = cpuhp_reserve_state(state);  		if (ret < 0) -			goto out; +			return ret;  		state = ret;  	}  	sp = cpuhp_get_step(state); -	if (name && sp->name) { -		ret = -EBUSY; -		goto out; -	} +	if (name && sp->name) +		return -EBUSY; +  	sp->startup.single = startup;  	sp->teardown.single = teardown;  	sp->name = name;  	sp->multi_instance = multi_instance;  	INIT_HLIST_HEAD(&sp->list); -out: -	mutex_unlock(&cpuhp_state_mutex);  	return ret;  } @@ -1428,6 +1423,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,  		return -EINVAL;  	get_online_cpus(); +	mutex_lock(&cpuhp_state_mutex);  	if (!invoke || !sp->startup.multi)  		goto add_node; @@ -1447,16 +1443,14 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,  		if (ret) {  			if (sp->teardown.multi)  				cpuhp_rollback_install(cpu, state, node); -			goto err; +			goto unlock;  		}  	}  add_node:  	ret = 0; -	mutex_lock(&cpuhp_state_mutex);  	hlist_add_head(node, &sp->list); +unlock:  	mutex_unlock(&cpuhp_state_mutex); - -err:  	put_online_cpus();  	return ret;  } @@ -1491,6 +1485,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,  		return -EINVAL;  	get_online_cpus(); +	mutex_lock(&cpuhp_state_mutex);  	ret = cpuhp_store_callbacks(state, name, startup, teardown,  				    multi_instance); @@ -1524,6 +1519,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,  		}  	}  out: +	mutex_unlock(&cpuhp_state_mutex);  	put_online_cpus();  	/*  	 * If the requested state is CPUHP_AP_ONLINE_DYN, return the @@ -1547,6 +1543,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,  		return -EINVAL;  	get_online_cpus(); +	mutex_lock(&cpuhp_state_mutex); +  	if (!invoke || !cpuhp_get_teardown_cb(state))  		goto remove;  	/* @@ -1563,7 +1561,6 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,  	}  remove: -	mutex_lock(&cpuhp_state_mutex);  	hlist_del(node);  	mutex_unlock(&cpuhp_state_mutex);  	put_online_cpus(); @@ -1571,6 +1568,7 @@ remove:  	return 0;  }  EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); +  /**   * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state   * @state:	The state to remove @@ -1589,6 +1587,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)  	get_online_cpus(); +	mutex_lock(&cpuhp_state_mutex);  	if (sp->multi_instance) {  		WARN(!hlist_empty(&sp->list),  		     "Error: Removing state %d which has instances left.\n", @@ -1613,6 +1612,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)  	}  remove:  	cpuhp_store_callbacks(state, NULL, NULL, NULL, false); +	mutex_unlock(&cpuhp_state_mutex);  	put_online_cpus();  }  EXPORT_SYMBOL(__cpuhp_remove_state); diff --git a/kernel/events/core.c b/kernel/events/core.c index 6f41548f2e32..ff01cba86f43 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -998,7 +998,7 @@ list_update_cgroup_event(struct 
perf_event *event,   */  #define PERF_CPU_HRTIMER (1000 / HZ)  /* - * function must be called with interrupts disbled + * function must be called with interrupts disabled   */  static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)  { @@ -4256,7 +4256,7 @@ int perf_event_release_kernel(struct perf_event *event)  	raw_spin_lock_irq(&ctx->lock);  	/* -	 * Mark this even as STATE_DEAD, there is no external reference to it +	 * Mark this event as STATE_DEAD, there is no external reference to it  	 * anymore.  	 *  	 * Anybody acquiring event->child_mutex after the below loop _must_ @@ -10417,21 +10417,22 @@ void perf_event_free_task(struct task_struct *task)  			continue;  		mutex_lock(&ctx->mutex); -again: -		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, -				group_entry) -			perf_free_event(event, ctx); +		raw_spin_lock_irq(&ctx->lock); +		/* +		 * Destroy the task <-> ctx relation and mark the context dead. +		 * +		 * This is important because even though the task hasn't been +		 * exposed yet the context has been (through child_list). +		 */ +		RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); +		WRITE_ONCE(ctx->task, TASK_TOMBSTONE); +		put_task_struct(task); /* cannot be last */ +		raw_spin_unlock_irq(&ctx->lock); -		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, -				group_entry) +		list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)  			perf_free_event(event, ctx); -		if (!list_empty(&ctx->pinned_groups) || -				!list_empty(&ctx->flexible_groups)) -			goto again; -  		mutex_unlock(&ctx->mutex); -  		put_ctx(ctx);  	}  } @@ -10469,7 +10470,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)  }  /* - * inherit a event from parent task to child task: + * Inherit a event from parent task to child task. + * + * Returns: + *  - valid pointer on success + *  - NULL for orphaned events + *  - IS_ERR() on error   */  static struct perf_event *  inherit_event(struct perf_event *parent_event, @@ -10563,6 +10569,16 @@ inherit_event(struct perf_event *parent_event,  	return child_event;  } +/* + * Inherits an event group. + * + * This will quietly suppress orphaned events; !inherit_event() is not an error. + * This matches with perf_event_release_kernel() removing all child events. + * + * Returns: + *  - 0 on success + *  - <0 on error + */  static int inherit_group(struct perf_event *parent_event,  	      struct task_struct *parent,  	      struct perf_event_context *parent_ctx, @@ -10577,6 +10593,11 @@ static int inherit_group(struct perf_event *parent_event,  				 child, NULL, child_ctx);  	if (IS_ERR(leader))  		return PTR_ERR(leader); +	/* +	 * @leader can be NULL here because of is_orphaned_event(). In this +	 * case inherit_event() will create individual events, similar to what +	 * perf_group_detach() would do anyway. +	 */  	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {  		child_ctr = inherit_event(sub, parent, parent_ctx,  					    child, leader, child_ctx); @@ -10586,6 +10607,17 @@ static int inherit_group(struct perf_event *parent_event,  	return 0;  } +/* + * Creates the child task context and tries to inherit the event-group. + * + * Clears @inherited_all on !attr.inherited or error. Note that we'll leave + * inherited_all set when we 'fail' to inherit an orphaned event; this is + * consistent with perf_event_release_kernel() removing all child events. 
+ * + * Returns: + *  - 0 on success + *  - <0 on error + */  static int  inherit_task_group(struct perf_event *event, struct task_struct *parent,  		   struct perf_event_context *parent_ctx, @@ -10608,7 +10640,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,  		 * First allocate and initialize a context for the  		 * child.  		 */ -  		child_ctx = alloc_perf_context(parent_ctx->pmu, child);  		if (!child_ctx)  			return -ENOMEM; @@ -10670,7 +10701,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)  		ret = inherit_task_group(event, parent, parent_ctx,  					 child, ctxn, &inherited_all);  		if (ret) -			break; +			goto out_unlock;  	}  	/* @@ -10686,7 +10717,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)  		ret = inherit_task_group(event, parent, parent_ctx,  					 child, ctxn, &inherited_all);  		if (ret) -			break; +			goto out_unlock;  	}  	raw_spin_lock_irqsave(&parent_ctx->lock, flags); @@ -10714,6 +10745,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)  	}  	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); +out_unlock:  	mutex_unlock(&parent_ctx->mutex);  	perf_unpin_context(parent_ctx); diff --git a/kernel/exit.c b/kernel/exit.c index e126ebf2400c..516acdb0e0ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -554,7 +554,6 @@ static void exit_mm(void)  	enter_lazy_tlb(mm, current);  	task_unlock(current);  	mm_update_next_owner(mm); -	userfaultfd_exit(mm);  	mmput(mm);  	if (test_thread_flag(TIF_MEMDIE))  		exit_oom_victim(); diff --git a/kernel/futex.c b/kernel/futex.c index 229a744b1781..45858ec73941 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2815,7 +2815,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,  {  	struct hrtimer_sleeper timeout, *to = NULL;  	struct rt_mutex_waiter rt_waiter; -	struct rt_mutex *pi_mutex = NULL;  	struct futex_hash_bucket *hb;  	union futex_key key2 = FUTEX_KEY_INIT;  	struct futex_q q = futex_q_init; @@ -2899,6 +2898,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,  		if (q.pi_state && (q.pi_state->owner != current)) {  			spin_lock(q.lock_ptr);  			ret = fixup_pi_state_owner(uaddr2, &q, current); +			if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) +				rt_mutex_unlock(&q.pi_state->pi_mutex);  			/*  			 * Drop the reference to the pi state which  			 * the requeue_pi() code acquired for us. @@ -2907,6 +2908,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,  			spin_unlock(q.lock_ptr);  		}  	} else { +		struct rt_mutex *pi_mutex; +  		/*  		 * We have been woken up by futex_unlock_pi(), a timeout, or a  		 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor @@ -2930,18 +2933,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,  		if (res)  			ret = (res < 0) ? res : 0; +		/* +		 * If fixup_pi_state_owner() faulted and was unable to handle +		 * the fault, unlock the rt_mutex and return the fault to +		 * userspace. +		 */ +		if (ret && rt_mutex_owner(pi_mutex) == current) +			rt_mutex_unlock(pi_mutex); +  		/* Unqueue and drop the lock. */  		unqueue_me_pi(&q);  	} -	/* -	 * If fixup_pi_state_owner() faulted and was unable to handle the -	 * fault, unlock the rt_mutex and return the fault to userspace. 
-	 */ -	if (ret == -EFAULT) { -		if (pi_mutex && rt_mutex_owner(pi_mutex) == current) -			rt_mutex_unlock(pi_mutex); -	} else if (ret == -EINTR) { +	if (ret == -EINTR) {  		/*  		 * We've already been requeued, but cannot restart by calling  		 * futex_lock_pi() directly. We could restart this syscall, but diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4544b115f5eb..e2d356dd7581 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -59,7 +59,7 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)  struct cpumask *  irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)  { -	int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; +	int n, nodes, cpus_per_vec, extra_vecs, curvec;  	int affv = nvecs - affd->pre_vectors - affd->post_vectors;  	int last_affv = affv + affd->pre_vectors;  	nodemask_t nodemsk = NODE_MASK_NONE; @@ -94,19 +94,21 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)  		goto done;  	} -	/* Spread the vectors per node */ -	vecs_per_node = affv / nodes; -	/* Account for rounding errors */ -	extra_vecs = affv - (nodes * vecs_per_node); -  	for_each_node_mask(n, nodemsk) { -		int ncpus, v, vecs_to_assign = vecs_per_node; +		int ncpus, v, vecs_to_assign, vecs_per_node; + +		/* Spread the vectors per node */ +		vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;  		/* Get the cpus on this node which are in the mask */  		cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n));  		/* Calculate the number of cpus per vector */  		ncpus = cpumask_weight(nmsk); +		vecs_to_assign = min(vecs_per_node, ncpus); + +		/* Account for rounding errors */ +		extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);  		for (v = 0; curvec < last_affv && v < vecs_to_assign;  		     curvec++, v++) { @@ -115,14 +117,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)  			/* Account for extra vectors to compensate rounding errors */  			if (extra_vecs) {  				cpus_per_vec++; -				if (!--extra_vecs) -					vecs_per_node++; +				--extra_vecs;  			}  			irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);  		}  		if (curvec >= last_affv)  			break; +		--nodes;  	}  done: diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b56a558e406d..b118735fea9d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -614,13 +614,13 @@ static int kexec_calculate_store_digests(struct kimage *image)  		ret = crypto_shash_final(desc, digest);  		if (ret)  			goto out_free_digest; -		ret = kexec_purgatory_get_set_symbol(image, "sha_regions", -						sha_regions, sha_region_sz, 0); +		ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", +						     sha_regions, sha_region_sz, 0);  		if (ret)  			goto out_free_digest; -		ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", -						digest, SHA256_DIGEST_SIZE, 0); +		ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", +						     digest, SHA256_DIGEST_SIZE, 0);  		if (ret)  			goto out_free_digest;  	} diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 4cef7e4706b0..799a8a452187 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,11 +15,7 @@ int kimage_is_destination_range(struct kimage *image,  extern struct mutex kexec_mutex;  #ifdef CONFIG_KEXEC_FILE -struct kexec_sha_region { -	unsigned long start; -	unsigned long len; -}; - +#include <linux/purgatory.h>  void kimage_file_post_load_cleanup(struct kimage *image);  #else /* 
CONFIG_KEXEC_FILE */  static inline void kimage_file_post_load_cleanup(struct kimage *image) { } diff --git a/kernel/kthread.c b/kernel/kthread.c index 2f26adea0f84..26db528c1d88 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@  #include <linux/freezer.h>  #include <linux/ptrace.h>  #include <linux/uaccess.h> +#include <linux/cgroup.h>  #include <trace/events/sched.h>  static DEFINE_SPINLOCK(kthread_create_lock); @@ -225,6 +226,7 @@ static int kthread(void *_create)  	ret = -EINTR;  	if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { +		cgroup_kthread_ready();  		__kthread_parkme(self);  		ret = threadfn(data);  	} @@ -538,6 +540,7 @@ int kthreadd(void *unused)  	set_mems_allowed(node_states[N_MEMORY]);  	current->flags |= PF_NOFREEZE; +	cgroup_init_kthreadd();  	for (;;) {  		set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 12e38c213b70..a95e5d1f4a9c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3262,10 +3262,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,  	if (depth) {  		hlock = curr->held_locks + depth - 1;  		if (hlock->class_idx == class_idx && nest_lock) { -			if (hlock->references) +			if (hlock->references) { +				/* +				 * Check: unsigned int references:12, overflow. +				 */ +				if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) +					return 0; +  				hlock->references++; -			else +			} else {  				hlock->references = 2; +			}  			return 1;  		} diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c2b88490d857..c08fbd2f5ba9 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -46,13 +46,13 @@ enum {  		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)  /* - * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,   * .data and .bss to fit in required 32MB limit for the kernel. With - * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems.   * So, reduce the static allocations for lockdeps related structures so that   * everything fits in current required size limit.   */ -#ifdef CONFIG_PROVE_LOCKING_SMALL +#ifdef CONFIG_LOCKDEP_SMALL  /*   * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies   * we track. 
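As a rough userspace illustration of the overflow guard the lockdep.c hunk above adds before incrementing the 12-bit held_lock reference bitfield, here is a minimal sketch; it is not kernel code, and the names nested_lock and try_get_ref are invented for the example.

#include <stdbool.h>
#include <stdio.h>

/* A reference count kept in a 12-bit bitfield, mirroring
 * "unsigned int references:12" in struct held_lock. */
struct nested_lock {
	unsigned int refs:12;
};

/* Increment the counter, refusing to wrap past its 12-bit maximum.
 * The patch does the equivalent with DEBUG_LOCKS_WARN_ON(). */
static bool try_get_ref(struct nested_lock *nl)
{
	if (nl->refs == (1U << 12) - 1) {
		fprintf(stderr, "refcount overflow avoided\n");
		return false;
	}
	nl->refs++;
	return true;
}

int main(void)
{
	struct nested_lock nl = { .refs = (1U << 12) - 2 };

	printf("%d\n", try_get_ref(&nl));	/* 1: 4094 -> 4095 */
	printf("%d\n", try_get_ref(&nl));	/* 0: would wrap to 0 */
	return 0;
}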
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 7bc24d477805..c65f7989f850 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -213,10 +213,9 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)  		 */  		if (sem->count == 0)  			break; -		if (signal_pending_state(state, current)) { -			ret = -EINTR; -			goto out; -		} +		if (signal_pending_state(state, current)) +			goto out_nolock; +  		set_current_state(state);  		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);  		schedule(); @@ -224,12 +223,19 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)  	}  	/* got the lock */  	sem->count = -1; -out:  	list_del(&waiter.list);  	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);  	return ret; + +out_nolock: +	list_del(&waiter.list); +	if (!list_empty(&sem->wait_list)) +		__rwsem_do_wake(sem, 1); +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	return -EINTR;  }  void __sched __down_write(struct rw_semaphore *sem) diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index da6c9a34f62f..6b7abb334ca6 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -50,7 +50,7 @@ static void test_mutex_work(struct work_struct *work)  	if (mtx->flags & TEST_MTX_TRY) {  		while (!ww_mutex_trylock(&mtx->mutex)) -			cpu_relax(); +			cond_resched();  	} else {  		ww_mutex_lock(&mtx->mutex, NULL);  	} @@ -88,7 +88,7 @@ static int __test_mutex(unsigned int flags)  				ret = -EINVAL;  				break;  			} -			cpu_relax(); +			cond_resched();  		} while (time_before(jiffies, timeout));  	} else {  		ret = wait_for_completion_timeout(&mtx.done, TIMEOUT); @@ -627,7 +627,7 @@ static int __init test_ww_mutex_init(void)  	if (ret)  		return ret; -	ret = stress(4096, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); +	ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL);  	if (ret)  		return ret; diff --git a/kernel/memremap.c b/kernel/memremap.c index 06123234f118..07e85e5229da 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,11 +247,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data)  	align_start = res->start & ~(SECTION_SIZE - 1);  	align_size = ALIGN(resource_size(res), SECTION_SIZE); -	lock_device_hotplug();  	mem_hotplug_begin();  	arch_remove_memory(align_start, align_size);  	mem_hotplug_done(); -	unlock_device_hotplug();  	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);  	pgmap_radix_release(res); @@ -364,11 +362,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,  	if (error)  		goto err_pfn_remap; -	lock_device_hotplug();  	mem_hotplug_begin();  	error = arch_add_memory(nid, align_start, align_size, true);  	mem_hotplug_done(); -	unlock_device_hotplug();  	if (error)  		goto err_add_memory; diff --git a/kernel/padata.c b/kernel/padata.c index 05316c9f32da..3202aa17492c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -186,19 +186,20 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)  	reorder = &next_queue->reorder; +	spin_lock(&reorder->lock);  	if (!list_empty(&reorder->list)) {  		padata = list_entry(reorder->list.next,  				    struct padata_priv, list); -		spin_lock(&reorder->lock);  		list_del_init(&padata->list);  		atomic_dec(&pd->reorder_objects); -		spin_unlock(&reorder->lock);  		pd->processed++; +		spin_unlock(&reorder->lock);  		goto out;  	} +	spin_unlock(&reorder->lock);  	if (__this_cpu_read(pd->pqueue->cpu_index) == 
next_queue->cpu_index) {  		padata = ERR_PTR(-ENODATA); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0af928712174..266ddcc1d8bb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -184,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task)  	WARN_ON(!task->ptrace || task->parent != current); +	/* +	 * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely. +	 * Recheck state under the lock to close this race. +	 */  	spin_lock_irq(&task->sighand->siglock); -	if (__fatal_signal_pending(task)) -		wake_up_state(task, __TASK_TRACED); -	else -		task->state = TASK_TRACED; +	if (task->state == __TASK_TRACED) { +		if (__fatal_signal_pending(task)) +			wake_up_state(task, __TASK_TRACED); +		else +			task->state = TASK_TRACED; +	}  	spin_unlock_irq(&task->sighand->siglock);  } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a08795e21628..00a45c45beca 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -96,10 +96,10 @@ static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);  static int __sched_clock_stable_early = 1;  /* - * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset + * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset   */ -static __read_mostly u64 raw_offset; -static __read_mostly u64 gtod_offset; +__read_mostly u64 __sched_clock_offset; +static __read_mostly u64 __gtod_offset;  struct sched_clock_data {  	u64			tick_raw; @@ -131,17 +131,24 @@ static void __set_sched_clock_stable(void)  	/*  	 * Attempt to make the (initial) unstable->stable transition continuous.  	 */ -	raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw); +	__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);  	printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", -			scd->tick_gtod, gtod_offset, -			scd->tick_raw,  raw_offset); +			scd->tick_gtod, __gtod_offset, +			scd->tick_raw,  __sched_clock_offset);  	static_branch_enable(&__sched_clock_stable);  	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);  } -static void __clear_sched_clock_stable(struct work_struct *work) +static void __sched_clock_work(struct work_struct *work) +{ +	static_branch_disable(&__sched_clock_stable); +} + +static DECLARE_WORK(sched_clock_work, __sched_clock_work); + +static void __clear_sched_clock_stable(void)  {  	struct sched_clock_data *scd = this_scd(); @@ -154,17 +161,17 @@ static void __clear_sched_clock_stable(struct work_struct *work)  	 *  	 * Still do what we can.  	 
*/ -	gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod); +	__gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);  	printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", -			scd->tick_gtod, gtod_offset, -			scd->tick_raw,  raw_offset); +			scd->tick_gtod, __gtod_offset, +			scd->tick_raw,  __sched_clock_offset); -	static_branch_disable(&__sched_clock_stable);  	tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); -} -static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); +	if (sched_clock_stable()) +		schedule_work(&sched_clock_work); +}  void clear_sched_clock_stable(void)  { @@ -173,7 +180,7 @@ void clear_sched_clock_stable(void)  	smp_mb(); /* matches sched_clock_init_late() */  	if (sched_clock_running == 2) -		schedule_work(&sched_clock_work); +		__clear_sched_clock_stable();  }  void sched_clock_init_late(void) @@ -214,7 +221,7 @@ static inline u64 wrap_max(u64 x, u64 y)   */  static u64 sched_clock_local(struct sched_clock_data *scd)  { -	u64 now, clock, old_clock, min_clock, max_clock; +	u64 now, clock, old_clock, min_clock, max_clock, gtod;  	s64 delta;  again: @@ -231,9 +238,10 @@ again:  	 *		      scd->tick_gtod + TICK_NSEC);  	 */ -	clock = scd->tick_gtod + gtod_offset + delta; -	min_clock = wrap_max(scd->tick_gtod, old_clock); -	max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); +	gtod = scd->tick_gtod + __gtod_offset; +	clock = gtod + delta; +	min_clock = wrap_max(gtod, old_clock); +	max_clock = wrap_max(old_clock, gtod + TICK_NSEC);  	clock = wrap_max(clock, min_clock);  	clock = wrap_min(clock, max_clock); @@ -317,7 +325,7 @@ u64 sched_clock_cpu(int cpu)  	u64 clock;  	if (sched_clock_stable()) -		return sched_clock() + raw_offset; +		return sched_clock() + __sched_clock_offset;  	if (unlikely(!sched_clock_running))  		return 0ull; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 956383844116..3b31fc05a0f1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3287,10 +3287,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)  	struct task_struct *p;  	/* -	 * Optimization: we know that if all tasks are in -	 * the fair class we can call that function directly: +	 * Optimization: we know that if all tasks are in the fair class we can +	 * call that function directly, but only if the @prev task wasn't of a +	 * higher scheduling class, because otherwise those loose the +	 * opportunity to pull in more work from other CPUs.  	 */ -	if (likely(rq->nr_running == rq->cfs.h_nr_running)) { +	if (likely((prev->sched_class == &idle_sched_class || +		    prev->sched_class == &fair_sched_class) && +		   rq->nr_running == rq->cfs.h_nr_running)) { +  		p = fair_sched_class.pick_next_task(rq, prev, rf);  		if (unlikely(p == RETRY_TASK))  			goto again; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8f8de3d4d6b7..54c577578da6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -36,6 +36,7 @@ struct sugov_policy {  	u64 last_freq_update_time;  	s64 freq_update_delay_ns;  	unsigned int next_freq; +	unsigned int cached_raw_freq;  	/* The next fields are only needed if fast switch cannot be used. 
*/  	struct irq_work irq_work; @@ -52,7 +53,6 @@ struct sugov_cpu {  	struct update_util_data update_util;  	struct sugov_policy *sg_policy; -	unsigned int cached_raw_freq;  	unsigned long iowait_boost;  	unsigned long iowait_boost_max;  	u64 last_update; @@ -116,7 +116,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,  /**   * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @sg_policy: schedutil policy object to compute the new frequency for.   * @util: Current CPU utilization.   * @max: CPU capacity.   * @@ -136,19 +136,18 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,   * next_freq (as calculated above) is returned, subject to policy min/max and   * cpufreq driver limitations.   */ -static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, -				  unsigned long max) +static unsigned int get_next_freq(struct sugov_policy *sg_policy, +				  unsigned long util, unsigned long max)  { -	struct sugov_policy *sg_policy = sg_cpu->sg_policy;  	struct cpufreq_policy *policy = sg_policy->policy;  	unsigned int freq = arch_scale_freq_invariant() ?  				policy->cpuinfo.max_freq : policy->cur;  	freq = (freq + (freq >> 2)) * util / max; -	if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) +	if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)  		return sg_policy->next_freq; -	sg_cpu->cached_raw_freq = freq; +	sg_policy->cached_raw_freq = freq;  	return cpufreq_driver_resolve_freq(policy, freq);  } @@ -213,7 +212,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,  	} else {  		sugov_get_util(&util, &max);  		sugov_iowait_boost(sg_cpu, &util, &max); -		next_f = get_next_freq(sg_cpu, util, max); +		next_f = get_next_freq(sg_policy, util, max);  	}  	sugov_update_commit(sg_policy, time, next_f);  } @@ -267,7 +266,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,  		sugov_iowait_boost(j_sg_cpu, &util, &max);  	} -	return get_next_freq(sg_cpu, util, max); +	return get_next_freq(sg_policy, util, max);  }  static void sugov_update_shared(struct update_util_data *hook, u64 time, @@ -580,25 +579,19 @@ static int sugov_start(struct cpufreq_policy *policy)  	sg_policy->next_freq = UINT_MAX;  	sg_policy->work_in_progress = false;  	sg_policy->need_freq_update = false; +	sg_policy->cached_raw_freq = 0;  	for_each_cpu(cpu, policy->cpus) {  		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); +		memset(sg_cpu, 0, sizeof(*sg_cpu));  		sg_cpu->sg_policy = sg_policy; -		if (policy_is_shared(policy)) { -			sg_cpu->util = 0; -			sg_cpu->max = 0; -			sg_cpu->flags = SCHED_CPUFREQ_RT; -			sg_cpu->last_update = 0; -			sg_cpu->cached_raw_freq = 0; -			sg_cpu->iowait_boost = 0; -			sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; -			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, -						     sugov_update_shared); -		} else { -			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, -						     sugov_update_single); -		} +		sg_cpu->flags = SCHED_CPUFREQ_RT; +		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; +		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, +					     policy_is_shared(policy) ? 
+							sugov_update_shared : +							sugov_update_single);  	}  	return 0;  } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 99b2c33a9fbc..a2ce59015642 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -445,13 +445,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,   *   * This function returns true if:   * - *   runtime / (deadline - t) > dl_runtime / dl_period , + *   runtime / (deadline - t) > dl_runtime / dl_deadline ,   *   * IOW we can't recycle current parameters.   * - * Notice that the bandwidth check is done against the period. For + * Notice that the bandwidth check is done against the deadline. For   * task with deadline equal to period this is the same of using - * dl_deadline instead of dl_period in the equation above. + * dl_period instead of dl_deadline in the equation above.   */  static bool dl_entity_overflow(struct sched_dl_entity *dl_se,  			       struct sched_dl_entity *pi_se, u64 t) @@ -476,7 +476,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,  	 * of anything below microseconds resolution is actually fiction  	 * (but still we want to give the user that illusion >;).  	 */ -	left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); +	left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);  	right = ((dl_se->deadline - t) >> DL_SCALE) *  		(pi_se->dl_runtime >> DL_SCALE); @@ -505,10 +505,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,  	}  } +static inline u64 dl_next_period(struct sched_dl_entity *dl_se) +{ +	return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period; +} +  /*   * If the entity depleted all its runtime, and if we want it to sleep   * while waiting for some new execution time to become available, we - * set the bandwidth enforcement timer to the replenishment instant + * set the bandwidth replenishment timer to the replenishment instant   * and try to activate it.   *   * Notice that it is important for the caller to know if the timer @@ -530,7 +535,7 @@ static int start_dl_timer(struct task_struct *p)  	 * that it is actually coming from rq->clock and not from  	 * hrtimer's time base reading.  	 */ -	act = ns_to_ktime(dl_se->deadline); +	act = ns_to_ktime(dl_next_period(dl_se));  	now = hrtimer_cb_get_time(timer);  	delta = ktime_to_ns(now) - rq_clock(rq);  	act = ktime_add_ns(act, delta); @@ -638,6 +643,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)  		lockdep_unpin_lock(&rq->lock, rf.cookie);  		rq = dl_task_offline_migration(rq, p);  		rf.cookie = lockdep_pin_lock(&rq->lock); +		update_rq_clock(rq);  		/*  		 * Now that the task has been migrated to the new RQ and we @@ -689,6 +695,37 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)  	timer->function = dl_task_timer;  } +/* + * During the activation, CBS checks if it can reuse the current task's + * runtime and period. If the deadline of the task is in the past, CBS + * cannot use the runtime, and so it replenishes the task. This rule + * works fine for implicit deadline tasks (deadline == period), and the + * CBS was designed for implicit deadline tasks. However, a task with + * constrained deadline (deadine < period) might be awakened after the + * deadline, but before the next period. In this case, replenishing the + * task would allow it to run for runtime / deadline. As in this case + * deadline < period, CBS enables a task to run for more than the + * runtime / period. 
In a very loaded system, this can cause a domino + * effect, making other tasks miss their deadlines. + * + * To avoid this problem, in the activation of a constrained deadline + * task after the deadline but before the next period, throttle the + * task and set the replenishing timer to the begin of the next period, + * unless it is boosted. + */ +static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) +{ +	struct task_struct *p = dl_task_of(dl_se); +	struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + +	if (dl_time_before(dl_se->deadline, rq_clock(rq)) && +	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { +		if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) +			return; +		dl_se->dl_throttled = 1; +	} +} +  static  int dl_runtime_exceeded(struct sched_dl_entity *dl_se)  { @@ -922,6 +959,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)  	__dequeue_dl_entity(dl_se);  } +static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) +{ +	return dl_se->dl_deadline < dl_se->dl_period; +} +  static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)  {  	struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -948,6 +990,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)  	}  	/* +	 * Check if a constrained deadline task was activated +	 * after the deadline but before the next period. +	 * If that is the case, the task will be throttled and +	 * the replenishment timer will be set to the next period. +	 */ +	if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) +		dl_check_constrained_dl(&p->dl); + +	/*  	 * If p is throttled, we do nothing. In fact, if it exhausted  	 * its budget it needs a replenishment and, since it now is on  	 * its rq, the bandwidth timer callback (which clearly has not diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e88b35ac157..dea138964b91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5799,7 +5799,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t  	 * Due to large variance we need a large fuzz factor; hackbench in  	 * particularly is sensitive here.  	 */ -	if ((avg_idle / 512) < avg_cost) +	if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)  		return -1;  	time = local_clock(); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 69631fa46c2f..1b3c8189b286 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -51,6 +51,11 @@ SCHED_FEAT(NONTASK_CAPACITY, true)   */  SCHED_FEAT(TTWU_QUEUE, true) +/* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. + */ +SCHED_FEAT(SIS_AVG_CPU, false) +  #ifdef HAVE_RT_PUSH_IPI  /*   * In order to avoid a thundering herd attack of CPUs that are diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 7296b7308eca..f15fb2bdbc0d 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -169,7 +169,7 @@ static inline int calc_load_write_idx(void)  	 * If the folding window started, make sure we start writing in the  	 * next idle-delta.  	 */ -	if (!time_before(jiffies, calc_load_update)) +	if (!time_before(jiffies, READ_ONCE(calc_load_update)))  		idx++;  	return idx & 1; @@ -202,8 +202,9 @@ void calc_load_exit_idle(void)  	struct rq *this_rq = this_rq();  	/* -	 * If we're still before the sample window, we're done. +	 * If we're still before the pending sample window, we're done.  	 
*/ +	this_rq->calc_load_update = READ_ONCE(calc_load_update);  	if (time_before(jiffies, this_rq->calc_load_update))  		return; @@ -212,7 +213,6 @@ void calc_load_exit_idle(void)  	 * accounted through the nohz accounting, so skip the entire deal and  	 * sync up for the next window.  	 */ -	this_rq->calc_load_update = calc_load_update;  	if (time_before(jiffies, this_rq->calc_load_update + 10))  		this_rq->calc_load_update += LOAD_FREQ;  } @@ -308,13 +308,15 @@ calc_load_n(unsigned long load, unsigned long exp,   */  static void calc_global_nohz(void)  { +	unsigned long sample_window;  	long delta, active, n; -	if (!time_before(jiffies, calc_load_update + 10)) { +	sample_window = READ_ONCE(calc_load_update); +	if (!time_before(jiffies, sample_window + 10)) {  		/*  		 * Catch-up, fold however many we are behind still  		 */ -		delta = jiffies - calc_load_update - 10; +		delta = jiffies - sample_window - 10;  		n = 1 + (delta / LOAD_FREQ);  		active = atomic_long_read(&calc_load_tasks); @@ -324,7 +326,7 @@ static void calc_global_nohz(void)  		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);  		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); -		calc_load_update += n * LOAD_FREQ; +		WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);  	}  	/* @@ -352,9 +354,11 @@ static inline void calc_global_nohz(void) { }   */  void calc_global_load(unsigned long ticks)  { +	unsigned long sample_window;  	long active, delta; -	if (time_before(jiffies, calc_load_update + 10)) +	sample_window = READ_ONCE(calc_load_update); +	if (time_before(jiffies, sample_window + 10))  		return;  	/* @@ -371,7 +375,7 @@ void calc_global_load(unsigned long ticks)  	avenrun[1] = calc_load(avenrun[1], EXP_5, active);  	avenrun[2] = calc_load(avenrun[2], EXP_15, active); -	calc_load_update += LOAD_FREQ; +	WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);  	/*  	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4d2ea6f25568..b8c84c6dee64 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -242,6 +242,45 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)  }  EXPORT_SYMBOL(prepare_to_wait_event); +/* + * Note! These two wait functions are entered with the + * wait-queue lock held (and interrupts off in the _irq + * case), so there is no race with testing the wakeup + * condition in the caller before they add the wait + * entry to the wake queue. 
+ */ +int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) +{ +	if (likely(list_empty(&wait->task_list))) +		__add_wait_queue_tail(wq, wait); + +	set_current_state(TASK_INTERRUPTIBLE); +	if (signal_pending(current)) +		return -ERESTARTSYS; + +	spin_unlock(&wq->lock); +	schedule(); +	spin_lock(&wq->lock); +	return 0; +} +EXPORT_SYMBOL(do_wait_intr); + +int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) +{ +	if (likely(list_empty(&wait->task_list))) +		__add_wait_queue_tail(wq, wait); + +	set_current_state(TASK_INTERRUPTIBLE); +	if (signal_pending(current)) +		return -ERESTARTSYS; + +	spin_unlock_irq(&wq->lock); +	schedule(); +	spin_lock_irq(&wq->lock); +	return 0; +} +EXPORT_SYMBOL(do_wait_intr_irq); +  /**   * finish_wait - clean up after waiting in a queue   * @q: waitqueue waited on diff --git a/kernel/sysctl.c b/kernel/sysctl.c index acf0a5a06da7..8c8714fcb53c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2133,9 +2133,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,  	if (write) {  		if (*negp)  			return -EINVAL; +		if (*lvalp > UINT_MAX) +			return -EINVAL;  		*valp = *lvalp;  	} else {  		unsigned int val = *valp; +		*negp = false;  		*lvalp = (unsigned long)val;  	}  	return 0; diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7906b3f0c41a..497719127bf9 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -125,7 +125,7 @@ int register_refined_jiffies(long cycles_per_second)  	shift_hz += cycles_per_tick/2;  	do_div(shift_hz, cycles_per_tick);  	/* Calculate nsec_per_tick using shift_hz */ -	nsec_per_tick = (u64)TICK_NSEC << 8; +	nsec_per_tick = (u64)NSEC_PER_SEC << 8;  	nsec_per_tick += (u32)shift_hz/2;  	do_div(nsec_per_tick, (u32)shift_hz); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d5038005eb5d..d4a06e714645 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -429,7 +429,7 @@ config BLK_DEV_IO_TRACE  	  If unsure, say N. -config KPROBE_EVENT +config KPROBE_EVENTS  	depends on KPROBES  	depends on HAVE_REGS_AND_STACK_ACCESS_API  	bool "Enable kprobes-based dynamic events" @@ -447,7 +447,7 @@ config KPROBE_EVENT  	  This option is also required by perf-probe subcommand of perf tools.  	  If you want to use perf tools, this option is strongly recommended. 
-config UPROBE_EVENT +config UPROBE_EVENTS  	bool "Enable uprobes-based dynamic events"  	depends on ARCH_SUPPORTS_UPROBES  	depends on MMU @@ -466,7 +466,7 @@ config UPROBE_EVENT  config BPF_EVENTS  	depends on BPF_SYSCALL -	depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS +	depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS  	bool  	default y  	help diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e57980845549..90f2701d92a7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -57,7 +57,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o  obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o  obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o  obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o -obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o  obj-$(CONFIG_TRACEPOINTS) += power-traces.o  ifeq ($(CONFIG_PM),y)  obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o @@ -66,7 +66,7 @@ ifeq ($(CONFIG_TRACING),y)  obj-$(CONFIG_KGDB_KDB) += trace_kdb.o  endif  obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o -obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o  obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d1597c9ee30..dd3e91d68dc7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3755,23 +3755,24 @@ static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)  	ftrace_probe_registered = 1;  } -static void __disable_ftrace_function_probe(void) +static bool __disable_ftrace_function_probe(void)  {  	int i;  	if (!ftrace_probe_registered) -		return; +		return false;  	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {  		struct hlist_head *hhd = &ftrace_func_hash[i];  		if (hhd->first) -			return; +			return false;  	}  	/* no more funcs left */  	ftrace_shutdown(&trace_probe_ops, 0);  	ftrace_probe_registered = 0; +	return true;  } @@ -3901,6 +3902,7 @@ static void  __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  				  void *data, int flags)  { +	struct ftrace_ops_hash old_hash_ops;  	struct ftrace_func_entry *rec_entry;  	struct ftrace_func_probe *entry;  	struct ftrace_func_probe *p; @@ -3912,6 +3914,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	struct hlist_node *tmp;  	char str[KSYM_SYMBOL_LEN];  	int i, ret; +	bool disabled;  	if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))  		func_g.search = NULL; @@ -3930,6 +3933,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	mutex_lock(&trace_probe_ops.func_hash->regex_lock); +	old_hash_ops.filter_hash = old_hash; +	/* Probes only have filters */ +	old_hash_ops.notrace_hash = NULL; +  	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);  	if (!hash)  		/* Hmm, should report this somehow */ @@ -3967,12 +3974,17 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  		}  	}  	mutex_lock(&ftrace_lock); -	__disable_ftrace_function_probe(); +	disabled = __disable_ftrace_function_probe();  	/*  	 * Remove after the disable is called. Otherwise, if the last  	 * probe is removed, a null hash means *all enabled*.  	 
*/  	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + +	/* still need to update the function call sites */ +	if (ftrace_enabled && !disabled) +		ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, +				       &old_hash_ops);  	synchronize_sched();  	if (!ret)  		free_ftrace_hash_rcu(old_hash); @@ -4416,16 +4428,24 @@ static int __init set_graph_notrace_function(char *str)  }  __setup("ftrace_graph_notrace=", set_graph_notrace_function); +static int __init set_graph_max_depth_function(char *str) +{ +	if (!str) +		return 0; +	fgraph_max_depth = simple_strtoul(str, NULL, 0); +	return 1; +} +__setup("ftrace_graph_max_depth=", set_graph_max_depth_function); +  static void __init set_ftrace_early_graph(char *buf, int enable)  {  	int ret;  	char *func;  	struct ftrace_hash *hash; -	if (enable) -		hash = ftrace_graph_hash; -	else -		hash = ftrace_graph_notrace_hash; +	hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); +	if (WARN_ON(!hash)) +		return;  	while (buf) {  		func = strsep(&buf, ","); @@ -4435,6 +4455,11 @@ static void __init set_ftrace_early_graph(char *buf, int enable)  			printk(KERN_DEBUG "ftrace: function %s not "  					  "traceable\n", func);  	} + +	if (enable) +		ftrace_graph_hash = hash; +	else +		ftrace_graph_notrace_hash = hash;  }  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -5488,7 +5513,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,   * Normally the mcount trampoline will call the ops->func, but there   * are times that it should not. For example, if the ops does not   * have its own recursion protection, then it should call the - * ftrace_ops_recurs_func() instead. + * ftrace_ops_assist_func() instead.   *   * Returns the function that the trampoline should call for @ops.   
*/ @@ -5541,6 +5566,15 @@ static void clear_ftrace_pids(struct trace_array *tr)  	trace_free_pid_list(pid_list);  } +void ftrace_clear_pids(struct trace_array *tr) +{ +	mutex_lock(&ftrace_lock); + +	clear_ftrace_pids(tr); + +	mutex_unlock(&ftrace_lock); +} +  static void ftrace_pid_reset(struct trace_array *tr)  {  	mutex_lock(&ftrace_lock); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 96fc3c043ad6..ca47a4fa2986 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3405,11 +3405,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);  int ring_buffer_iter_empty(struct ring_buffer_iter *iter)  {  	struct ring_buffer_per_cpu *cpu_buffer; +	struct buffer_page *reader; +	struct buffer_page *head_page; +	struct buffer_page *commit_page; +	unsigned commit;  	cpu_buffer = iter->cpu_buffer; -	return iter->head_page == cpu_buffer->commit_page && -		iter->head == rb_commit_index(cpu_buffer); +	/* Remember, trace recording is off when iterator is in use */ +	reader = cpu_buffer->reader_page; +	head_page = cpu_buffer->head_page; +	commit_page = cpu_buffer->commit_page; +	commit = rb_page_commit(commit_page); + +	return ((iter->head_page == commit_page && iter->head == commit) || +		(iter->head_page == reader && commit_page == head_page && +		 head_page->read == commit && +		 iter->head == rb_page_commit(cpu_buffer->reader_page)));  }  EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); @@ -4826,9 +4838,9 @@ static __init int test_ringbuffer(void)  		rb_data[cpu].cnt = cpu;  		rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],  						 "rbtester/%d", cpu); -		if (WARN_ON(!rb_threads[cpu])) { +		if (WARN_ON(IS_ERR(rb_threads[cpu]))) {  			pr_cont("FAILED\n"); -			ret = -1; +			ret = PTR_ERR(rb_threads[cpu]);  			goto out_free;  		} @@ -4838,9 +4850,9 @@ static __init int test_ringbuffer(void)  	/* Now create the rb hammer! 
*/  	rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); -	if (WARN_ON(!rb_hammer)) { +	if (WARN_ON(IS_ERR(rb_hammer))) {  		pr_cont("FAILED\n"); -		ret = -1; +		ret = PTR_ERR(rb_hammer);  		goto out_free;  	} diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 707445ceb7ef..0ad75e9698f6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4341,22 +4341,22 @@ static const char readme_msg[] =  	"\t\t\t  traces\n"  #endif  #endif /* CONFIG_STACK_TRACER */ -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS  	"  kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"  	"\t\t\t  Write into this file to define/undefine new trace events.\n"  #endif -#ifdef CONFIG_UPROBE_EVENT +#ifdef CONFIG_UPROBE_EVENTS  	"  uprobe_events\t\t- Add/remove/show the userspace dynamic events\n"  	"\t\t\t  Write into this file to define/undefine new trace events.\n"  #endif -#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)  	"\t  accepts: event-definitions (one definition per line)\n"  	"\t   Format: p|r[:[<group>/]<event>] <place> [<args>]\n"  	"\t           -:[<group>/]<event>\n" -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS  	"\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"  #endif -#ifdef CONFIG_UPROBE_EVENT +#ifdef CONFIG_UPROBE_EVENTS  	"\t    place: <path>:<offset>\n"  #endif  	"\t     args: <name>=fetcharg[:type]\n" @@ -6733,11 +6733,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash,  		return ret;   out_reg: -	ret = register_ftrace_function_probe(glob, ops, count); +	ret = alloc_snapshot(&global_trace); +	if (ret < 0) +		goto out; -	if (ret >= 0) -		alloc_snapshot(&global_trace); +	ret = register_ftrace_function_probe(glob, ops, count); + out:  	return ret < 0 ? 
ret : 0;  } @@ -7402,6 +7404,7 @@ static int instance_rmdir(const char *name)  	tracing_set_nop(tr);  	event_trace_del_tracer(tr); +	ftrace_clear_pids(tr);  	ftrace_destroy_function_files(tr);  	tracefs_remove_recursive(tr->dir);  	free_trace_buffers(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ae1cce91fead..d19d52d600d6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -896,6 +896,7 @@ int using_ftrace_ops_list_func(void);  void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);  void ftrace_init_tracefs_toplevel(struct trace_array *tr,  				  struct dentry *d_tracer); +void ftrace_clear_pids(struct trace_array *tr);  #else  static inline int ftrace_trace_task(struct trace_array *tr)  { @@ -914,6 +915,7 @@ ftrace_init_global_array_ops(struct trace_array *tr) { }  static inline void ftrace_reset_array_ops(struct trace_array *tr) { }  static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }  static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_clear_pids(struct trace_array *tr) { }  /* ftace_func_t type is not defined, use macro instead of static inline */  #define ftrace_init_array_ops(tr, func) do { } while (0)  #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 0c0ae54d44c6..903273c93e61 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -248,7 +248,7 @@ ASSIGN_FETCH_FUNC(file_offset, ftype),			\  #define FETCH_TYPE_STRING	0  #define FETCH_TYPE_STRSIZE	1 -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS  struct symbol_cache;  unsigned long update_symbol_cache(struct symbol_cache *sc);  void free_symbol_cache(struct symbol_cache *sc); @@ -278,7 +278,7 @@ alloc_symbol_cache(const char *sym, long offset)  {  	return NULL;  } -#endif /* CONFIG_KPROBE_EVENT */ +#endif /* CONFIG_KPROBE_EVENTS */  struct probe_arg {  	struct fetch_param	fetch; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 1d68b5b7ad41..5fb1f2c87e6b 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -65,7 +65,7 @@ void stack_trace_print(void)  }  /* - * When arch-specific code overides this function, the following + * When arch-specific code overrides this function, the following   * data should be filled up, assuming stack_trace_max_lock is held to   * prevent concurrent updates.   
*     stack_trace_index[] diff --git a/kernel/ucount.c b/kernel/ucount.c index 62630a40ab3a..b4eeee03934f 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -144,7 +144,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)  		new->ns = ns;  		new->uid = uid; -		atomic_set(&new->count, 0); +		new->count = 0;  		spin_lock_irq(&ucounts_lock);  		ucounts = find_ucounts(ns, uid, hashent); @@ -155,8 +155,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)  			ucounts = new;  		}  	} -	if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) +	if (ucounts->count == INT_MAX)  		ucounts = NULL; +	else +		ucounts->count += 1;  	spin_unlock_irq(&ucounts_lock);  	return ucounts;  } @@ -165,13 +167,15 @@ static void put_ucounts(struct ucounts *ucounts)  {  	unsigned long flags; -	if (atomic_dec_and_test(&ucounts->count)) { -		spin_lock_irqsave(&ucounts_lock, flags); +	spin_lock_irqsave(&ucounts_lock, flags); +	ucounts->count -= 1; +	if (!ucounts->count)  		hlist_del_init(&ucounts->node); -		spin_unlock_irqrestore(&ucounts_lock, flags); +	else +		ucounts = NULL; +	spin_unlock_irqrestore(&ucounts_lock, flags); -		kfree(ucounts); -	} +	kfree(ucounts);  }  static inline bool atomic_inc_below(atomic_t *v, int u) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 072cbc9b175d..c0168b7da1ea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1507,6 +1507,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,  	struct timer_list *timer = &dwork->timer;  	struct work_struct *work = &dwork->work; +	WARN_ON_ONCE(!wq);  	WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||  		     timer->data != (unsigned long)dwork);  	WARN_ON_ONCE(timer_pending(timer));
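The ucount.c hunks above move the reference count under the existing ucounts_lock instead of using atomics, so the hash lookup, the INT_MAX ceiling check and the final free all observe one consistent count. A rough userspace sketch of that get/put pattern, assuming a pthread mutex in place of the spinlock; obj, obj_get and obj_put are invented names, not the kernel API.

#include <pthread.h>
#include <stdlib.h>
#include <limits.h>

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;

struct obj {
	int count;		/* protected by obj_lock, like ucounts->count */
};

/* Take a reference, refusing to go past INT_MAX (mirrors get_ucounts()). */
static struct obj *obj_get(struct obj *o)
{
	pthread_mutex_lock(&obj_lock);
	if (o->count == INT_MAX)
		o = NULL;		/* saturated: report failure */
	else
		o->count += 1;
	pthread_mutex_unlock(&obj_lock);
	return o;
}

/* Drop a reference; free only when the last one goes away
 * (mirrors put_ucounts(), where kfree(NULL) is a no-op). */
static void obj_put(struct obj *o)
{
	pthread_mutex_lock(&obj_lock);
	o->count -= 1;
	if (o->count != 0)
		o = NULL;		/* someone else still holds a reference */
	pthread_mutex_unlock(&obj_lock);
	free(o);			/* free(NULL) does nothing */
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	o->count = 1;			/* initial reference from the allocator */
	obj_get(o);
	obj_put(o);
	obj_put(o);			/* last put frees the object */
	return 0;
}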

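The sched/loadavg.c hunks read the shared calc_load_update window once into a local snapshot with READ_ONCE() and publish the new window with WRITE_ONCE(), so concurrent nohz idle code never works against a half-updated value. Below is a minimal C11 sketch of that snapshot-then-publish idiom, using relaxed atomics to stand in for READ_ONCE()/WRITE_ONCE(); next_sample, sample_window_next and LOAD_PERIOD are invented for the example and only echo the shape of calc_global_load().

#include <stdatomic.h>
#include <stdio.h>

#define LOAD_PERIOD 5	/* stand-in for LOAD_FREQ */

/* Shared, torn-read-free word, like calc_load_update under READ_ONCE(). */
static _Atomic unsigned long next_sample = 100;

/* Advance the sample window from a single snapshot of the shared value:
 * read once, compute against the snapshot, publish once. */
static void sample_window_next(unsigned long now)
{
	unsigned long window =
		atomic_load_explicit(&next_sample, memory_order_relaxed);

	if (now < window + 10)		/* still before the pending window */
		return;

	/* ... fold the pending samples here ... */

	atomic_store_explicit(&next_sample, window + LOAD_PERIOD,
			      memory_order_relaxed);
}

int main(void)
{
	sample_window_next(105);	/* too early: window stays at 100 */
	sample_window_next(111);	/* advances the window to 105 */
	printf("%lu\n", atomic_load(&next_sample));
	return 0;
}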