Index: UPDATING
===================================================================
--- UPDATING (revision 337395)
+++ UPDATING (revision 337828)
@@ -16,6 +16,21 @@
the tip of head, and then rebuild without this option. The bootstrap process
from older version of current across the gcc/clang cutover is a bit fragile.
+20180814 p13 FreeBSD-SA-18:08.tcp [revised]
+ FreeBSD-SA-18:09.l1tf
+ FreeBSD-SA-18:10.ip
+ FreeBSD-SA-18:11.hostapd
+
+ Revise manual pages. [SA-18:08.tcp]
+
+ Fix L1 Terminal Fault (L1TF) kernel information disclosure.
+ [SA-18:09.l1tf]
+
+ Fix resource exhaustion in IP fragment reassembly. [SA-18:10.ip]
+
+ Fix unauthenticated EAPOL-Key decryption vulnerability.
+ [SA-18:11.hostapd]
+
20180806 p12 FreeBSD-SA-18:08.tcp
Fix resource exhaustion in TCP reassembly.
Index: contrib/wpa/src/rsn_supp/wpa.c
===================================================================
--- contrib/wpa/src/rsn_supp/wpa.c (revision 337395)
+++ contrib/wpa/src/rsn_supp/wpa.c (revision 337828)
@@ -2027,6 +2027,17 @@
if ((sm->proto == WPA_PROTO_RSN || sm->proto == WPA_PROTO_OSEN) &&
(key_info & WPA_KEY_INFO_ENCR_KEY_DATA)) {
+ /*
+ * Only decrypt the Key Data field if the frame's authenticity
+ * was verified. When using AES-SIV (FILS), the MIC flag is not
+ * set, so this check should only be performed if mic_len != 0
+ * which is the case in this code branch.
+ */
+ if (!(key_info & WPA_KEY_INFO_MIC)) {
+ wpa_msg(sm->ctx->msg_ctx, MSG_WARNING,
+ "WPA: Ignore EAPOL-Key with encrypted but unauthenticated data");
+ goto out;
+ }
if (wpa_supplicant_decrypt_key_data(sm, key, ver, key_data,
&key_data_len))
goto out;
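
The rule the new check enforces can be stated in isolation. A minimal sketch, assuming the standard IEEE 802.11 Key Information bit positions for Key MIC (0x0100) and Encrypted Key Data (0x1000); the helper name and its standalone form are illustrative only, not the wpa_supplicant API:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define KEY_INFO_MIC		0x0100	/* Key Information: Key MIC bit */
#define KEY_INFO_ENCR_KEY_DATA	0x1000	/* Key Information: Encrypted Key Data bit */

/*
 * Decide whether an EAPOL-Key frame's Key Data field may be decrypted.
 * With AES-SIV (FILS) the MIC length is zero and authenticity is proven
 * by the SIV decryption itself, so the MIC-bit requirement only applies
 * when mic_len != 0 -- the same condition as the patched branch above.
 */
static bool
may_decrypt_key_data(uint16_t key_info, size_t mic_len)
{
	if (!(key_info & KEY_INFO_ENCR_KEY_DATA))
		return (false);		/* nothing encrypted; nothing to decrypt */
	if (mic_len == 0)
		return (true);		/* AES-SIV: authenticated by decryption */
	return ((key_info & KEY_INFO_MIC) != 0);
}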
Index: share/man/man4/inet.4
===================================================================
--- share/man/man4/inet.4 (revision 337395)
+++ share/man/man4/inet.4 (revision 337828)
@@ -28,7 +28,7 @@
.\" From: @(#)inet.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
-.Dd Feb 4, 2016
+.Dd August 14, 2018
.Dt INET 4
.Os
.Sh NAME
@@ -229,15 +229,38 @@
cycle greatly.
Default is 0 (sequential IP IDs).
IPv6 flow IDs and fragment IDs are always random.
+.It Va ip.maxfrags
+Integer: maximum number of fragments the host will accept and simultaneously
+hold across all reassembly queues in all VNETs.
+If set to 0, reassembly is disabled.
+If set to -1, this limit is not applied.
+This limit is recalculated when the number of mbuf clusters is changed.
+This is a global limit.
.It Va ip.maxfragpackets
-Integer: maximum number of fragmented packets the host will accept and hold
-in the reassembling queue simultaneously.
-0 means that the host will not accept any fragmented packets.
-\-1 means that the host will accept as many fragmented packets as it receives.
+Integer: maximum number of fragmented packets the host will accept and
+simultaneously hold in the reassembly queue for a particular VNET.
+0 means that the host will not accept any fragmented packets for that VNET.
+\-1 means that the host will not apply this limit for that VNET.
+This limit is recalculated when the number of mbuf clusters is changed.
+This is a per-VNET limit.
+.It Va ip.maxfragbucketsize
+Integer: maximum number of reassembly queues per bucket.
+Fragmented packets are hashed to buckets.
+Each bucket has a list of reassembly queues.
+The system must compare the incoming packets to the existing reassembly queues
+in the bucket to find a matching reassembly queue.
+To preserve system resources, the system limits the number of reassembly
+queues allowed in each bucket.
+This limit is recalculated when the number of mbuf clusters is changed or
+when the value of
+.Va ip.maxfragpackets
+changes.
+This is a per-VNET limit.
.It Va ip.maxfragsperpacket
Integer: maximum number of fragments the host will accept and hold
-in the reassembling queue for a packet.
-0 means that the host will not accept any fragmented packets.
+in the reassembly queue for a packet.
+0 means that the host will not accept any fragmented packets for the VNET.
+This is a per-VNET limit.
.El
.Sh SEE ALSO
.Xr ioctl 2 ,
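
The four knobs documented above live under net.inet.ip and can be read (or, with a write argument, tuned) from userland. A minimal sketch using sysctlbyname(3); the node names are assumed to match the sysctl declarations added in sys/netinet/ip_reass.c below:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	static const char *names[] = {
		"net.inet.ip.maxfrags",		/* global, across all VNETs */
		"net.inet.ip.maxfragpackets",	/* per-VNET packet limit */
		"net.inet.ip.maxfragbucketsize",/* per-bucket queue limit */
		"net.inet.ip.maxfragsperpacket",/* fragments per packet */
	};
	int val;
	size_t i, len;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		len = sizeof(val);
		if (sysctlbyname(names[i], &val, &len, NULL, 0) == 0)
			printf("%s = %d\n", names[i], val);
		else
			perror(names[i]);
	}
	return (0);
}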
Index: share/man/man4/inet6.4
===================================================================
--- share/man/man4/inet6.4 (revision 337395)
+++ share/man/man4/inet6.4 (revision 337828)
@@ -29,7 +29,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd September 2, 2009
+.Dd August 14, 2018
.Dt INET6 4
.Os
.Sh NAME
@@ -219,12 +219,41 @@
This value applies to all the transport protocols on top of
.Tn IPv6 .
There are APIs to override the value.
+.It Dv IPV6CTL_MAXFRAGS
+.Pq ip6.maxfrags
+Integer: maximum number of fragments the host will accept and simultaneously
+hold across all reassembly queues in all VNETs.
+If set to 0, fragment reassembly is disabled.
+If set to -1, this limit is not applied.
+This limit is recalculated when the number of mbuf clusters is changed.
+This is a global limit.
.It Dv IPV6CTL_MAXFRAGPACKETS
.Pq ip6.maxfragpackets
-Integer: default maximum number of fragmented packets the node will accept.
-0 means that the node will not accept any fragmented packets.
--1 means that the node will accept as many fragmented packets as it receives.
-The flag is provided basically for avoiding possible DoS attacks.
+Integer: maximum number of fragmented packets the node will accept and
+simultaneously hold in the reassembly queue for a particular VNET.
+0 means that the node will not accept any fragmented packets for that VNET.
+-1 means that the node will not apply this limit for that VNET.
+This limit is recalculated when the number of mbuf clusters is changed.
+This is a per-VNET limit.
+.It Dv IPV6CTL_MAXFRAGBUCKETSIZE
+.Pq ip6.maxfragbucketsize
+Integer: maximum number of reassembly queues per bucket.
+Fragmented packets are hashed to buckets.
+Each bucket has a list of reassembly queues.
+The system must compare the incoming packets to the existing reassembly queues
+in the bucket to find a matching reassembly queue.
+To preserve system resources, the system limits the number of reassembly
+queues allowed in each bucket.
+This limit is recalculated when the number of mbuf clusters is changed or
+when the value of
+.Va ip6.maxfragpackets
+changes.
+This is a per-VNET limit.
+.It Dv IPV6CTL_MAXFRAGSPERPACKET
+.Pq ip6.maxfragsperpacket
+Integer: maximum number of fragments the host will accept and hold in the
+reassembly queue for a packet.
+This is a per-VNET limit.
.It Dv IPV6CTL_ACCEPT_RTADV
.Pq ip6.accept_rtadv
Boolean: the default value of a per-interface flag to
Index: share/man/man4/tcp.4
===================================================================
--- share/man/man4/tcp.4 (revision 337395)
+++ share/man/man4/tcp.4 (revision 337828)
@@ -34,7 +34,7 @@
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
-.Dd February 6, 2017
+.Dd August 6, 2018
.Dt TCP 4
.Os
.Sh NAME
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c (revision 337395)
+++ sys/amd64/amd64/pmap.c (revision 337828)
@@ -1206,6 +1206,9 @@
vm_size_t s;
int error, i, pv_npg;
+ /* L1TF, reserve page @0 unconditionally */
+ vm_page_blacklist_add(0, bootverbose);
+
/*
* Initialize the vm page array entries for the kernel pmap's
* page table pages.
Index: sys/amd64/vmm/intel/vmx.c
===================================================================
--- sys/amd64/vmm/intel/vmx.c (revision 337395)
+++ sys/amd64/vmm/intel/vmx.c (revision 337828)
@@ -183,6 +183,12 @@
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
&vpid_alloc_failed, 0, NULL);
+static int guest_l1d_flush;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
+ &guest_l1d_flush, 0, NULL);
+
+uint64_t vmx_msr_flush_cmd;
+
/*
* Use the last page below 4GB as the APIC access address. This address is
* occupied by the boot firmware so it is guaranteed that it will not conflict
@@ -718,6 +724,12 @@
return (error);
}
+ guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) == 0;
+ TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
+ if (guest_l1d_flush &&
+ (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0)
+ vmx_msr_flush_cmd = IA32_FLUSH_CMD_L1D;
+
/*
* Stash the cr0 and cr4 bits that must be fixed to 0 or 1
*/
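
Condensing the policy the hunk above implements: the flush defaults to on unless the CPU advertises RDCL_NO, the hw.vmm.l1d_flush loader tunable can override that default, and the MSR-based flush is chosen only when CPUID exposes the L1D_FLUSH command. A standalone sketch of that decision, with the feature words passed in explicitly; the helper is illustrative, not part of vmm:

#include <stdint.h>

#define IA32_ARCH_CAP_RDCL_NO	0x00000001	/* CPU reports it is not affected */
#define CPUID_STDEXT3_L1D_FLUSH	0x10000000	/* IA32_FLUSH_CMD MSR available */
#define IA32_FLUSH_CMD_L1D	0x00000001

/*
 * Returns the value to write to IA32_FLUSH_CMD before VM entry, or 0
 * when the software flush loop (or no flush at all) will be used.
 * *enabled mirrors the guest_l1d_flush flag set up in vmx.c above.
 */
static uint64_t
select_l1d_flush(uint64_t arch_caps, uint32_t stdext3, int *enabled)
{
	/* Default: flush unless the CPU reports it is not vulnerable. */
	*enabled = (arch_caps & IA32_ARCH_CAP_RDCL_NO) == 0;
	/* (The hw.vmm.l1d_flush tunable may override *enabled here.) */
	if (*enabled && (stdext3 & CPUID_STDEXT3_L1D_FLUSH) != 0)
		return (IA32_FLUSH_CMD_L1D);
	return (0);
}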
Index: sys/amd64/vmm/intel/vmx_genassym.c
===================================================================
--- sys/amd64/vmm/intel/vmx_genassym.c (revision 337395)
+++ sys/amd64/vmm/intel/vmx_genassym.c (revision 337828)
@@ -36,6 +36,7 @@
#include <vm/vm.h>
#include <vm/pmap.h>
+#include <vm/vm_param.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
@@ -86,3 +87,6 @@
ASSYM(KERNEL_SS, GSEL(GDATA_SEL, SEL_KPL));
ASSYM(KERNEL_CS, GSEL(GCODE_SEL, SEL_KPL));
+
+ASSYM(PAGE_SIZE, PAGE_SIZE);
+ASSYM(KERNBASE, KERNBASE);
Index: sys/amd64/vmm/intel/vmx_support.S
===================================================================
--- sys/amd64/vmm/intel/vmx_support.S (revision 337395)
+++ sys/amd64/vmm/intel/vmx_support.S (revision 337828)
@@ -28,6 +28,7 @@
*/
#include <machine/asmacros.h>
+#include <machine/specialreg.h>
#include "vmx_assym.h"
@@ -136,9 +137,47 @@
jbe invept_error /* Check invept instruction error */
guest_restore:
- cmpl $0, %edx
+
+ /*
+ * Flush L1D cache if requested. Use IA32_FLUSH_CMD MSR if available,
+ * otherwise load enough of the data from the zero_region to flush
+ * existing L1D content.
+ */
+#define L1D_FLUSH_SIZE (64 * 1024)
+ movl %edx, %r8d
+ cmpb $0, guest_l1d_flush(%rip)
+ je after_l1d
+ movq vmx_msr_flush_cmd(%rip), %rax
+ testq %rax, %rax
+ jz 1f
+ movq %rax, %rdx
+ shrq $32, %rdx
+ movl $MSR_IA32_FLUSH_CMD, %ecx
+ wrmsr
+ jmp after_l1d
+1: movq $KERNBASE, %r9
+ movq $-L1D_FLUSH_SIZE, %rcx
+ /*
+ * pass 1: Preload TLB.
+ * Kernel text is mapped using superpages. TLB preload is
+ * done for the benefit of older CPUs which split 2M page
+ * into 4k TLB entries.
+ */
+2: movb L1D_FLUSH_SIZE(%r9, %rcx), %al
+ addq $PAGE_SIZE, %rcx
+ jne 2b
+ xorl %eax, %eax
+ cpuid
+ movq $-L1D_FLUSH_SIZE, %rcx
+ /* pass 2: Read each cache line */
+3: movb L1D_FLUSH_SIZE(%r9, %rcx), %al
+ addq $64, %rcx
+ jne 3b
+ lfence
+#undef L1D_FLUSH_SIZE
+after_l1d:
+ cmpl $0, %r8d
je do_launch
-
VMX_GUEST_RESTORE
vmresume
/*
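
The assembly comments above describe the fallback flush. A rough C restatement of the same sequence, under assumptions: msr_write() stands in for wrmsr, and the 64 KB always-mapped region stands in for the KERNBASE text mapping; the real code also serializes with cpuid between the two passes and ends with lfence:

#include <stddef.h>
#include <stdint.h>

#define L1D_FLUSH_SIZE		(64 * 1024)
#define FLUSH_PAGE_SIZE		4096
#define CACHE_LINE_SIZE		64
#define MSR_IA32_FLUSH_CMD	0x10b
#define IA32_FLUSH_CMD_L1D	0x00000001

void msr_write(uint32_t msr, uint64_t val);	/* stand-in for wrmsr */

static void
l1d_flush(const volatile uint8_t *region, uint64_t flush_cmd)
{
	volatile uint8_t sink;
	size_t off;

	if (flush_cmd != 0) {
		/* Hardware assist: one MSR write flushes the whole L1D. */
		msr_write(MSR_IA32_FLUSH_CMD, flush_cmd);
		return;
	}
	/* Pass 1: touch one byte per page so the TLB covers the region. */
	for (off = 0; off < L1D_FLUSH_SIZE; off += FLUSH_PAGE_SIZE)
		sink = region[off];
	/* Pass 2: read every cache line to displace current L1D contents. */
	for (off = 0; off < L1D_FLUSH_SIZE; off += CACHE_LINE_SIZE)
		sink = region[off];
	(void)sink;
}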
Index: sys/conf/newvers.sh
===================================================================
--- sys/conf/newvers.sh (revision 337395)
+++ sys/conf/newvers.sh (revision 337828)
@@ -44,7 +44,7 @@
TYPE="FreeBSD"
REVISION="11.1"
-BRANCH="RELEASE-p12"
+BRANCH="RELEASE-p13"
if [ -n "${BRANCH_OVERRIDE}" ]; then
BRANCH=${BRANCH_OVERRIDE}
fi
Index: sys/netinet/ip_reass.c
===================================================================
--- sys/netinet/ip_reass.c (revision 337395)
+++ sys/netinet/ip_reass.c (revision 337828)
@@ -42,6 +42,7 @@
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
@@ -63,7 +64,7 @@
/*
* Reassembly headers are stored in hash buckets.
*/
-#define IPREASS_NHASH_LOG2 6
+#define IPREASS_NHASH_LOG2 10
#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK (IPREASS_NHASH - 1)
@@ -70,6 +71,7 @@
struct ipqbucket {
TAILQ_HEAD(ipqhead, ipq) head;
struct mtx lock;
+ int count;
};
static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]);
@@ -82,6 +84,9 @@
#define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock)
#define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED)
+static VNET_DEFINE(int, ipreass_maxbucketsize);
+#define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize)
+
void ipreass_init(void);
void ipreass_drain(void);
void ipreass_slowtimo(void);
@@ -89,27 +94,53 @@
void ipreass_destroy(void);
#endif
static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
+static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
static void ipreass_zone_change(void *);
static void ipreass_drain_tomax(void);
-static void ipq_free(struct ipqhead *, struct ipq *);
+static void ipq_free(struct ipqbucket *, struct ipq *);
static struct ipq * ipq_reuse(int);
static inline void
-ipq_timeout(struct ipqhead *head, struct ipq *fp)
+ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
{
IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
- ipq_free(head, fp);
+ ipq_free(bucket, fp);
}
static inline void
-ipq_drop(struct ipqhead *head, struct ipq *fp)
+ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
{
IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
- ipq_free(head, fp);
+ ipq_free(bucket, fp);
}
+/*
+ * By default, limit the number of IP fragments across all reassembly
+ * queues to 1/32 of the total number of mbuf clusters.
+ *
+ * Limit the total number of reassembly queues per VNET to the
+ * IP fragment limit, but ensure the limit will not allow any bucket
+ * to grow above 100 items. (The bucket limit is
+ * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct
+ * multiplier to reach a 100-item limit.)
+ * The 100-item limit was chosen as brief testing seems to show that
+ * this produces "reasonable" performance on some subset of systems
+ * under DoS attack.
+ */
+#define IP_MAXFRAGS (nmbclusters / 32)
+#define IP_MAXFRAGPACKETS (imin(IP_MAXFRAGS, IPREASS_NHASH * 50))
+
+static int maxfrags;
+static volatile u_int nfrags;
+SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
+ &maxfrags, 0,
+ "Maximum number of IPv4 fragments allowed across all reassembly queues");
+SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
+ __DEVOLATILE(u_int *, &nfrags), 0,
+ "Current number of IPv4 fragments across all reassembly queues");
+
static VNET_DEFINE(uma_zone_t, ipq_zone);
#define V_ipq_zone VNET(ipq_zone)
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET |
@@ -127,6 +158,10 @@
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(maxfragsperpacket), 0,
"Maximum number of IPv4 fragments allowed per packet");
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
+ sysctl_maxfragbucketsize, "I",
+ "Maximum number of IPv4 fragment reassembly queue entries per bucket");
/*
* Take incoming datagram fragment and try to reassemble it into
@@ -146,9 +181,9 @@
struct mbuf *p, *q, *nq, *t;
struct ipq *fp;
struct ipqhead *head;
- int i, hlen, next;
+ int i, hlen, next, tmpmax;
u_int8_t ecn, ecn0;
- uint32_t hash;
+ uint32_t hash, hashkey[3];
#ifdef RSS
uint32_t rss_hash, rss_type;
#endif
@@ -156,8 +191,12 @@
/*
* If no reassembling or maxfragsperpacket are 0,
* never accept fragments.
+ * Also, drop packet if it would exceed the maximum
+ * number of fragments.
*/
- if (V_noreass == 1 || V_maxfragsperpacket == 0) {
+ tmpmax = maxfrags;
+ if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
+ (tmpmax >= 0 && nfrags >= (u_int)tmpmax)) {
IPSTAT_INC(ips_fragments);
IPSTAT_INC(ips_fragdropped);
m_freem(m);
@@ -202,8 +241,12 @@
m->m_data += hlen;
m->m_len -= hlen;
- hash = ip->ip_src.s_addr ^ ip->ip_id;
- hash = jenkins_hash32(&hash, 1, V_ipq_hashseed) & IPREASS_HMASK;
+ hashkey[0] = ip->ip_src.s_addr;
+ hashkey[1] = ip->ip_dst.s_addr;
+ hashkey[2] = (uint32_t)ip->ip_p << 16;
+ hashkey[2] += ip->ip_id;
+ hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
+ hash &= IPREASS_HMASK;
head = &V_ipq[hash].head;
IPQ_LOCK(hash);
@@ -224,9 +267,12 @@
* If first fragment to arrive, create a reassembly queue.
*/
if (fp == NULL) {
- fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
+ if (V_ipq[hash].count < V_ipreass_maxbucketsize)
+ fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
if (fp == NULL)
fp = ipq_reuse(hash);
+ if (fp == NULL)
+ goto dropfrag;
#ifdef MAC
if (mac_ipq_init(fp, M_NOWAIT) != 0) {
uma_zfree(V_ipq_zone, fp);
@@ -236,7 +282,9 @@
mac_ipq_create(m, fp);
#endif
TAILQ_INSERT_HEAD(head, fp, ipq_list);
+ V_ipq[hash].count++;
fp->ipq_nfrags = 1;
+ atomic_add_int(&nfrags, 1);
fp->ipq_ttl = IPFRAGTTL;
fp->ipq_p = ip->ip_p;
fp->ipq_id = ip->ip_id;
@@ -247,6 +295,7 @@
goto done;
} else {
fp->ipq_nfrags++;
+ atomic_add_int(&nfrags, 1);
#ifdef MAC
mac_ipq_update(m, fp);
#endif
@@ -323,6 +372,7 @@
m->m_nextpkt = nq;
IPSTAT_INC(ips_fragdropped);
fp->ipq_nfrags--;
+ atomic_subtract_int(&nfrags, 1);
m_freem(q);
}
@@ -340,7 +390,7 @@
for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
if (ntohs(GETIP(q)->ip_off) != next) {
if (fp->ipq_nfrags > V_maxfragsperpacket)
- ipq_drop(head, fp);
+ ipq_drop(&V_ipq[hash], fp);
goto done;
}
next += ntohs(GETIP(q)->ip_len);
@@ -348,7 +398,7 @@
/* Make sure the last packet didn't have the IP_MF flag */
if (p->m_flags & M_IP_FRAG) {
if (fp->ipq_nfrags > V_maxfragsperpacket)
- ipq_drop(head, fp);
+ ipq_drop(&V_ipq[hash], fp);
goto done;
}
@@ -359,7 +409,7 @@
ip = GETIP(q);
if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
IPSTAT_INC(ips_toolong);
- ipq_drop(head, fp);
+ ipq_drop(&V_ipq[hash], fp);
goto done;
}
@@ -387,6 +437,7 @@
while (m->m_pkthdr.csum_data & 0xffff0000)
m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
(m->m_pkthdr.csum_data >> 16);
+ atomic_subtract_int(&nfrags, fp->ipq_nfrags);
#ifdef MAC
mac_ipq_reassemble(fp, m);
mac_ipq_destroy(fp);
@@ -401,6 +452,7 @@
ip->ip_src = fp->ipq_src;
ip->ip_dst = fp->ipq_dst;
TAILQ_REMOVE(head, fp, ipq_list);
+ V_ipq[hash].count--;
uma_zfree(V_ipq_zone, fp);
m->m_len += (ip->ip_hl << 2);
m->m_data -= (ip->ip_hl << 2);
@@ -446,8 +498,10 @@
dropfrag:
IPSTAT_INC(ips_fragdropped);
- if (fp != NULL)
+ if (fp != NULL) {
fp->ipq_nfrags--;
+ atomic_subtract_int(&nfrags, 1);
+ }
m_freem(m);
done:
IPQ_UNLOCK(hash);
@@ -462,21 +516,27 @@
void
ipreass_init(void)
{
+ int max;
for (int i = 0; i < IPREASS_NHASH; i++) {
TAILQ_INIT(&V_ipq[i].head);
mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
MTX_DEF | MTX_DUPOK);
+ V_ipq[i].count = 0;
}
V_ipq_hashseed = arc4random();
V_maxfragsperpacket = 16;
V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
NULL, UMA_ALIGN_PTR, 0);
- uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
+ max = IP_MAXFRAGPACKETS;
+ max = uma_zone_set_max(V_ipq_zone, max);
+ V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
- if (IS_DEFAULT_VNET(curvnet))
+ if (IS_DEFAULT_VNET(curvnet)) {
+ maxfrags = IP_MAXFRAGS;
EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
NULL, EVENTHANDLER_PRI_ANY);
+ }
}
/*
@@ -491,7 +551,7 @@
IPQ_LOCK(i);
TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
if (--fp->ipq_ttl == 0)
- ipq_timeout(&V_ipq[i].head, fp);
+ ipq_timeout(&V_ipq[i], fp);
IPQ_UNLOCK(i);
}
}
@@ -506,7 +566,10 @@
for (int i = 0; i < IPREASS_NHASH; i++) {
IPQ_LOCK(i);
while(!TAILQ_EMPTY(&V_ipq[i].head))
- ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head));
+ ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head));
+ KASSERT(V_ipq[i].count == 0,
+ ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
+ V_ipq[i].count, V_ipq));
IPQ_UNLOCK(i);
}
}
@@ -534,9 +597,23 @@
static void
ipreass_drain_tomax(void)
{
+ struct ipq *fp;
int target;
/*
+ * Make sure each bucket is under the new limit. If
+ * necessary, drop enough of the oldest elements from
+ * each bucket to get under the new limit.
+ */
+ for (int i = 0; i < IPREASS_NHASH; i++) {
+ IPQ_LOCK(i);
+ while (V_ipq[i].count > V_ipreass_maxbucketsize &&
+ (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
+ ipq_timeout(&V_ipq[i], fp);
+ IPQ_UNLOCK(i);
+ }
+
+ /*
* If we are over the maximum number of fragments,
* drain off enough to get down to the new limit,
* stripping off last elements on queues. Every
@@ -544,13 +621,11 @@
*/
target = uma_zone_get_max(V_ipq_zone);
while (uma_zone_get_cur(V_ipq_zone) > target) {
- struct ipq *fp;
-
for (int i = 0; i < IPREASS_NHASH; i++) {
IPQ_LOCK(i);
fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
if (fp != NULL)
- ipq_timeout(&V_ipq[i].head, fp);
+ ipq_timeout(&V_ipq[i], fp);
IPQ_UNLOCK(i);
}
}
@@ -559,9 +634,20 @@
static void
ipreass_zone_change(void *tag)
{
+ VNET_ITERATOR_DECL(vnet_iter);
+ int max;
- uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
- ipreass_drain_tomax();
+ maxfrags = IP_MAXFRAGS;
+ max = IP_MAXFRAGPACKETS;
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ max = uma_zone_set_max(V_ipq_zone, max);
+ V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
+ ipreass_drain_tomax();
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
}
/*
@@ -589,6 +675,7 @@
* and place an extreme upper bound.
*/
max = uma_zone_set_max(V_ipq_zone, max);
+ V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
ipreass_drain_tomax();
V_noreass = 0;
} else if (max == 0) {
@@ -597,6 +684,7 @@
} else if (max == -1) {
V_noreass = 0;
uma_zone_set_max(V_ipq_zone, 0);
+ V_ipreass_maxbucketsize = INT_MAX;
} else
return (EINVAL);
return (0);
@@ -610,34 +698,36 @@
ipq_reuse(int start)
{
struct ipq *fp;
- int i;
+ int bucket, i;
IPQ_LOCK_ASSERT(start);
- for (i = start;; i++) {
- if (i == IPREASS_NHASH)
- i = 0;
- if (i != start && IPQ_TRYLOCK(i) == 0)
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ bucket = (start + i) % IPREASS_NHASH;
+ if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
continue;
- fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
+ fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
if (fp) {
struct mbuf *m;
IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
+ atomic_subtract_int(&nfrags, fp->ipq_nfrags);
while (fp->ipq_frags) {
m = fp->ipq_frags;
fp->ipq_frags = m->m_nextpkt;
m_freem(m);
}
- TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list);
- if (i != start)
- IPQ_UNLOCK(i);
- IPQ_LOCK_ASSERT(start);
- return (fp);
+ TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
+ V_ipq[bucket].count--;
+ if (bucket != start)
+ IPQ_UNLOCK(bucket);
+ break;
}
- if (i != start)
- IPQ_UNLOCK(i);
+ if (bucket != start)
+ IPQ_UNLOCK(bucket);
}
+ IPQ_LOCK_ASSERT(start);
+ return (fp);
}
/*
@@ -644,15 +734,36 @@
* Free a fragment reassembly header and all associated datagrams.
*/
static void
-ipq_free(struct ipqhead *fhp, struct ipq *fp)
+ipq_free(struct ipqbucket *bucket, struct ipq *fp)
{
struct mbuf *q;
+ atomic_subtract_int(&nfrags, fp->ipq_nfrags);
while (fp->ipq_frags) {
q = fp->ipq_frags;
fp->ipq_frags = q->m_nextpkt;
m_freem(q);
}
- TAILQ_REMOVE(fhp, fp, ipq_list);
+ TAILQ_REMOVE(&bucket->head, fp, ipq_list);
+ bucket->count--;
uma_zfree(V_ipq_zone, fp);
}
+
+/*
+ * Get or set the maximum number of reassembly queues per bucket.
+ */
+static int
+sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
+{
+ int error, max;
+
+ max = V_ipreass_maxbucketsize;
+ error = sysctl_handle_int(oidp, &max, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (max <= 0)
+ return (EINVAL);
+ V_ipreass_maxbucketsize = max;
+ ipreass_drain_tomax();
+ return (0);
+}
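
To make the 50-multiplier comment in ip_reass.c concrete, the default-limit arithmetic can be worked as a standalone program; the nmbclusters value is an example only, and IPREASS_NHASH follows the new IPREASS_NHASH_LOG2 of 10:

#include <stdio.h>

#define IPREASS_NHASH	(1 << 10)	/* 1024 buckets, per the new LOG2 */

static int imin(int a, int b) { return (a < b ? a : b); }
static int imax(int a, int b) { return (a > b ? a : b); }

int
main(void)
{
	int nmbclusters = 2000000;	/* example value only */
	int maxfrags = nmbclusters / 32;			 /* IP_MAXFRAGS */
	int maxfragpackets = imin(maxfrags, IPREASS_NHASH * 50); /* IP_MAXFRAGPACKETS */
	int maxbucketsize = imax(maxfragpackets / (IPREASS_NHASH / 2), 1);

	/*
	 * Prints 62500, 51200 and 100: with plenty of clusters the packet
	 * limit caps at NHASH * 50 = 51200, and dividing by NHASH / 2 = 512
	 * yields the intended 100-queue ceiling per bucket.  Smaller systems
	 * get proportionally smaller limits, never below 1 per bucket.
	 */
	printf("maxfrags=%d maxfragpackets=%d maxbucketsize=%d\n",
	    maxfrags, maxfragpackets, maxbucketsize);
	return (0);
}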
Index: sys/netinet6/frag6.c
===================================================================
--- sys/netinet6/frag6.c (revision 337395)
+++ sys/netinet6/frag6.c (revision 337828)
@@ -36,6 +36,7 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/hash.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
@@ -47,6 +48,8 @@
#include <sys/kernel.h>
#include <sys/syslog.h>
+#include <machine/atomic.h>
+
#include <net/if.h>
#include <net/if_var.h>
#include <net/netisr.h>
@@ -63,58 +66,110 @@
#include <security/mac/mac_framework.h>
-static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *);
-static void frag6_deq(struct ip6asfrag *);
-static void frag6_insque(struct ip6q *, struct ip6q *);
-static void frag6_remque(struct ip6q *);
-static void frag6_freef(struct ip6q *);
-
-static struct mtx ip6qlock;
/*
- * These fields all protected by ip6qlock.
+ * Reassembly headers are stored in hash buckets.
*/
-static VNET_DEFINE(u_int, frag6_nfragpackets);
-static VNET_DEFINE(u_int, frag6_nfrags);
-static VNET_DEFINE(struct ip6q, ip6q); /* ip6 reassemble queue */
+#define IP6REASS_NHASH_LOG2 10
+#define IP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2)
+#define IP6REASS_HMASK (IP6REASS_NHASH - 1)
+static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *,
+ uint32_t bucket __unused);
+static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused);
+static void frag6_insque_head(struct ip6q *, struct ip6q *,
+ uint32_t bucket);
+static void frag6_remque(struct ip6q *, uint32_t bucket);
+static void frag6_freef(struct ip6q *, uint32_t bucket);
+
+struct ip6qbucket {
+ struct ip6q ip6q;
+ struct mtx lock;
+ int count;
+};
+
+static VNET_DEFINE(volatile u_int, frag6_nfragpackets);
+volatile u_int frag6_nfrags = 0;
+static VNET_DEFINE(struct ip6qbucket, ip6q[IP6REASS_NHASH]);
+static VNET_DEFINE(uint32_t, ip6q_hashseed);
+
#define V_frag6_nfragpackets VNET(frag6_nfragpackets)
-#define V_frag6_nfrags VNET(frag6_nfrags)
#define V_ip6q VNET(ip6q)
+#define V_ip6q_hashseed VNET(ip6q_hashseed)
-#define IP6Q_LOCK_INIT() mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF);
-#define IP6Q_LOCK() mtx_lock(&ip6qlock)
-#define IP6Q_TRYLOCK() mtx_trylock(&ip6qlock)
-#define IP6Q_LOCK_ASSERT() mtx_assert(&ip6qlock, MA_OWNED)
-#define IP6Q_UNLOCK() mtx_unlock(&ip6qlock)
+#define IP6Q_LOCK(i) mtx_lock(&V_ip6q[(i)].lock)
+#define IP6Q_TRYLOCK(i) mtx_trylock(&V_ip6q[(i)].lock)
+#define IP6Q_LOCK_ASSERT(i) mtx_assert(&V_ip6q[(i)].lock, MA_OWNED)
+#define IP6Q_UNLOCK(i) mtx_unlock(&V_ip6q[(i)].lock)
+#define IP6Q_HEAD(i) (&V_ip6q[(i)].ip6q)
static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header");
/*
+ * By default, limit the number of IP6 fragments across all reassembly
+ * queues to 1/32 of the total number of mbuf clusters.
+ *
+ * Limit the total number of reassembly queues per VNET to the
+ * IP6 fragment limit, but ensure the limit will not allow any bucket
+ * to grow above 100 items. (The bucket limit is
+ * IP6_MAXFRAGPACKETS / (IP6REASS_NHASH / 2), so the 50 is the correct
+ * multiplier to reach a 100-item limit.)
+ * The 100-item limit was chosen as brief testing seems to show that
+ * this produces "reasonable" performance on some subset of systems
+ * under DoS attack.
+ */
+#define IP6_MAXFRAGS (nmbclusters / 32)
+#define IP6_MAXFRAGPACKETS (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50))
+
+/*
* Initialise reassembly queue and fragment identifier.
*/
+void
+frag6_set_bucketsize()
+{
+ int i;
+
+ if ((i = V_ip6_maxfragpackets) > 0)
+ V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1);
+}
+
static void
frag6_change(void *tag)
{
+ VNET_ITERATOR_DECL(vnet_iter);
- V_ip6_maxfragpackets = nmbclusters / 4;
- V_ip6_maxfrags = nmbclusters / 4;
+ ip6_maxfrags = IP6_MAXFRAGS;
+ VNET_LIST_RLOCK_NOSLEEP();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
+ frag6_set_bucketsize();
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK_NOSLEEP();
}
void
frag6_init(void)
{
+ struct ip6q *q6;
+ int i;
- V_ip6_maxfragpackets = nmbclusters / 4;
- V_ip6_maxfrags = nmbclusters / 4;
- V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q;
-
+ V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
+ frag6_set_bucketsize();
+ for (i = 0; i < IP6REASS_NHASH; i++) {
+ q6 = IP6Q_HEAD(i);
+ q6->ip6q_next = q6->ip6q_prev = q6;
+ mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF);
+ V_ip6q[i].count = 0;
+ }
+ V_ip6q_hashseed = arc4random();
+ V_ip6_maxfragsperpacket = 64;
if (!IS_DEFAULT_VNET(curvnet))
return;
+ ip6_maxfrags = IP6_MAXFRAGS;
EVENTHANDLER_REGISTER(nmbclusters_change,
frag6_change, NULL, EVENTHANDLER_PRI_ANY);
-
- IP6Q_LOCK_INIT();
}
/*
@@ -155,12 +210,13 @@
struct mbuf *m = *mp, *t;
struct ip6_hdr *ip6;
struct ip6_frag *ip6f;
- struct ip6q *q6;
+ struct ip6q *head, *q6;
struct ip6asfrag *af6, *ip6af, *af6dwn;
struct in6_ifaddr *ia;
int offset = *offp, nxt, i, next;
int first_frag = 0;
int fragoff, frgpartlen; /* must be larger than u_int16_t */
+ uint32_t hash, hashkey[sizeof(struct in6_addr) * 2 + 1], *hashkeyp;
struct ifnet *dstifp;
u_int8_t ecn, ecn0;
#ifdef RSS
@@ -228,19 +284,38 @@
return (ip6f->ip6f_nxt);
}
- IP6Q_LOCK();
+ /* Get fragment length and discard 0-byte fragments. */
+ frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
+ if (frgpartlen == 0) {
+ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
+ offsetof(struct ip6_hdr, ip6_plen));
+ in6_ifstat_inc(dstifp, ifs6_reass_fail);
+ IP6STAT_INC(ip6s_fragdropped);
+ return IPPROTO_DONE;
+ }
+ hashkeyp = hashkey;
+ memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr));
+ hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+ memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr));
+ hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+ *hashkeyp = ip6f->ip6f_ident;
+ hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed);
+ hash &= IP6REASS_HMASK;
+ head = IP6Q_HEAD(hash);
+ IP6Q_LOCK(hash);
+
/*
* Enforce upper bound on number of fragments.
* If maxfrag is 0, never accept fragments.
* If maxfrag is -1, accept all fragments without limitation.
*/
- if (V_ip6_maxfrags < 0)
+ if (ip6_maxfrags < 0)
;
- else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags)
+ else if (frag6_nfrags >= (u_int)ip6_maxfrags)
goto dropfrag;
- for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next)
+ for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next)
if (ip6f->ip6f_ident == q6->ip6q_ident &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)
@@ -250,7 +325,7 @@
)
break;
- if (q6 == &V_ip6q) {
+ if (q6 == head) {
/*
* the first fragment to arrive, create a reassembly queue.
*/
@@ -265,9 +340,10 @@
*/
if (V_ip6_maxfragpackets < 0)
;
- else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
+ else if (V_ip6q[hash].count >= V_ip6_maxfragbucketsize ||
+ V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
goto dropfrag;
- V_frag6_nfragpackets++;
+ atomic_add_int(&V_frag6_nfragpackets, 1);
q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE,
M_NOWAIT);
if (q6 == NULL)
@@ -280,7 +356,7 @@
}
mac_ip6q_create(m, q6);
#endif
- frag6_insque(q6, &V_ip6q);
+ frag6_insque_head(q6, head, hash);
/* ip6q_nxt will be filled afterwards, from 1st fragment */
q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6;
@@ -314,7 +390,6 @@
* in size.
* If it would exceed, discard the fragment and return an ICMP error.
*/
- frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
if (q6->ip6q_unfrglen >= 0) {
/* The 1st fragment has already arrived. */
if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) {
@@ -321,7 +396,7 @@
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offset - sizeof(struct ip6_frag) +
offsetof(struct ip6_frag, ip6f_offlg));
- IP6Q_UNLOCK();
+ IP6Q_UNLOCK(hash);
return (IPPROTO_DONE);
}
} else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
@@ -328,7 +403,7 @@
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offset - sizeof(struct ip6_frag) +
offsetof(struct ip6_frag, ip6f_offlg));
- IP6Q_UNLOCK();
+ IP6Q_UNLOCK(hash);
return (IPPROTO_DONE);
}
/*
@@ -347,7 +422,7 @@
int erroff = af6->ip6af_offset;
/* dequeue the fragment. */
- frag6_deq(af6);
+ frag6_deq(af6, hash);
free(af6, M_FTABLE);
/* adjust pointer. */
@@ -445,7 +520,7 @@
}
af6 = af6->ip6af_down;
m_freem(IP6_REASS_MBUF(af6->ip6af_up));
- frag6_deq(af6->ip6af_up);
+ frag6_deq(af6->ip6af_up, hash);
}
#else
/*
@@ -494,16 +569,17 @@
/*
* Stick new segment in its place;
* check for complete reassembly.
+ * If not complete, check fragment limit.
* Move to front of packet queue, as we are
* the most recently active fragmented packet.
*/
- frag6_enq(ip6af, af6->ip6af_up);
- V_frag6_nfrags++;
+ frag6_enq(ip6af, af6->ip6af_up, hash);
+ atomic_add_int(&frag6_nfrags, 1);
q6->ip6q_nfrag++;
#if 0 /* xxx */
- if (q6 != V_ip6q.ip6q_next) {
- frag6_remque(q6);
- frag6_insque(q6, &V_ip6q);
+ if (q6 != head->ip6q_next) {
+ frag6_remque(q6, hash);
+ frag6_insque_head(q6, head, hash);
}
#endif
next = 0;
@@ -510,13 +586,21 @@
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = af6->ip6af_down) {
if (af6->ip6af_off != next) {
- IP6Q_UNLOCK();
+ if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
+ IP6STAT_INC(ip6s_fragdropped);
+ frag6_freef(q6, hash);
+ }
+ IP6Q_UNLOCK(hash);
return IPPROTO_DONE;
}
next += af6->ip6af_frglen;
}
if (af6->ip6af_up->ip6af_mff) {
- IP6Q_UNLOCK();
+ if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
+ IP6STAT_INC(ip6s_fragdropped);
+ frag6_freef(q6, hash);
+ }
+ IP6Q_UNLOCK(hash);
return IPPROTO_DONE;
}
@@ -526,7 +610,7 @@
ip6af = q6->ip6q_down;
t = m = IP6_REASS_MBUF(ip6af);
af6 = ip6af->ip6af_down;
- frag6_deq(ip6af);
+ frag6_deq(ip6af, hash);
while (af6 != (struct ip6asfrag *)q6) {
m->m_pkthdr.csum_flags &=
IP6_REASS_MBUF(af6)->m_pkthdr.csum_flags;
@@ -534,7 +618,7 @@
IP6_REASS_MBUF(af6)->m_pkthdr.csum_data;
af6dwn = af6->ip6af_down;
- frag6_deq(af6);
+ frag6_deq(af6, hash);
while (t->m_next)
t = t->m_next;
m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset);
@@ -560,13 +644,13 @@
#endif
if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) {
- frag6_remque(q6);
- V_frag6_nfrags -= q6->ip6q_nfrag;
+ frag6_remque(q6, hash);
+ atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
mac_ip6q_destroy(q6);
#endif
free(q6, M_FTABLE);
- V_frag6_nfragpackets--;
+ atomic_subtract_int(&V_frag6_nfragpackets, 1);
goto dropfrag;
}
@@ -579,14 +663,14 @@
*prvnxtp = nxt;
}
- frag6_remque(q6);
- V_frag6_nfrags -= q6->ip6q_nfrag;
+ frag6_remque(q6, hash);
+ atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
mac_ip6q_reassemble(q6, m);
mac_ip6q_destroy(q6);
#endif
free(q6, M_FTABLE);
- V_frag6_nfragpackets--;
+ atomic_subtract_int(&V_frag6_nfragpackets, 1);
if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */
int plen = 0;
@@ -608,7 +692,7 @@
m_tag_prepend(m, mtag);
#endif
- IP6Q_UNLOCK();
+ IP6Q_UNLOCK(hash);
IP6STAT_INC(ip6s_reassembled);
in6_ifstat_inc(dstifp, ifs6_reass_ok);
@@ -630,7 +714,7 @@
return nxt;
dropfrag:
- IP6Q_UNLOCK();
+ IP6Q_UNLOCK(hash);
in6_ifstat_inc(dstifp, ifs6_reass_fail);
IP6STAT_INC(ip6s_fragdropped);
m_freem(m);
@@ -641,12 +725,12 @@
* Free a fragment reassembly header and all
* associated datagrams.
*/
-void
-frag6_freef(struct ip6q *q6)
+static void
+frag6_freef(struct ip6q *q6, uint32_t bucket)
{
struct ip6asfrag *af6, *down6;
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = down6) {
@@ -653,7 +737,7 @@
struct mbuf *m = IP6_REASS_MBUF(af6);
down6 = af6->ip6af_down;
- frag6_deq(af6);
+ frag6_deq(af6, bucket);
/*
* Return ICMP time exceeded error for the 1st fragment.
@@ -675,13 +759,13 @@
m_freem(m);
free(af6, M_FTABLE);
}
- frag6_remque(q6);
- V_frag6_nfrags -= q6->ip6q_nfrag;
+ frag6_remque(q6, bucket);
+ atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
mac_ip6q_destroy(q6);
#endif
free(q6, M_FTABLE);
- V_frag6_nfragpackets--;
+ atomic_subtract_int(&V_frag6_nfragpackets, 1);
}
/*
@@ -688,11 +772,12 @@
* Put an ip fragment on a reassembly chain.
* Like insque, but pointers in middle of structure.
*/
-void
-frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
+static void
+frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6,
+ uint32_t bucket __unused)
{
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
af6->ip6af_up = up6;
af6->ip6af_down = up6->ip6af_down;
@@ -703,36 +788,41 @@
/*
* To frag6_enq as remque is to insque.
*/
-void
-frag6_deq(struct ip6asfrag *af6)
+static void
+frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused)
{
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
af6->ip6af_up->ip6af_down = af6->ip6af_down;
af6->ip6af_down->ip6af_up = af6->ip6af_up;
}
-void
-frag6_insque(struct ip6q *new, struct ip6q *old)
+static void
+frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket)
{
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
+ KASSERT(IP6Q_HEAD(bucket) == old,
+ ("%s: attempt to insert at head of wrong bucket"
+ " (bucket=%u, old=%p)", __func__, bucket, old));
new->ip6q_prev = old;
new->ip6q_next = old->ip6q_next;
old->ip6q_next->ip6q_prev= new;
old->ip6q_next = new;
+ V_ip6q[bucket].count++;
}
-void
-frag6_remque(struct ip6q *p6)
+static void
+frag6_remque(struct ip6q *p6, uint32_t bucket)
{
- IP6Q_LOCK_ASSERT();
+ IP6Q_LOCK_ASSERT(bucket);
p6->ip6q_prev->ip6q_next = p6->ip6q_next;
p6->ip6q_next->ip6q_prev = p6->ip6q_prev;
+ V_ip6q[bucket].count--;
}
/*
@@ -744,37 +834,71 @@
frag6_slowtimo(void)
{
VNET_ITERATOR_DECL(vnet_iter);
- struct ip6q *q6;
+ struct ip6q *head, *q6;
+ int i;
VNET_LIST_RLOCK_NOSLEEP();
- IP6Q_LOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- q6 = V_ip6q.ip6q_next;
- if (q6)
- while (q6 != &V_ip6q) {
+ for (i = 0; i < IP6REASS_NHASH; i++) {
+ IP6Q_LOCK(i);
+ head = IP6Q_HEAD(i);
+ q6 = head->ip6q_next;
+ if (q6 == NULL) {
+ /*
+ * XXXJTL: This should never happen. This
+ * should turn into an assertion.
+ */
+ IP6Q_UNLOCK(i);
+ continue;
+ }
+ while (q6 != head) {
--q6->ip6q_ttl;
q6 = q6->ip6q_next;
if (q6->ip6q_prev->ip6q_ttl == 0) {
IP6STAT_INC(ip6s_fragtimeout);
/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
- frag6_freef(q6->ip6q_prev);
+ frag6_freef(q6->ip6q_prev, i);
}
}
+ /*
+ * If we are over the maximum number of fragments
+ * (due to the limit being lowered), drain off
+ * enough to get down to the new limit.
+ * Note that we drain all reassembly queues if
+ * maxfragpackets is 0 (fragmentation is disabled),
+ * and don't enforce a limit when maxfragpackets
+ * is negative.
+ */
+ while ((V_ip6_maxfragpackets == 0 ||
+ (V_ip6_maxfragpackets > 0 &&
+ V_ip6q[i].count > V_ip6_maxfragbucketsize)) &&
+ head->ip6q_prev != head) {
+ IP6STAT_INC(ip6s_fragoverflow);
+ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+ frag6_freef(head->ip6q_prev, i);
+ }
+ IP6Q_UNLOCK(i);
+ }
/*
- * If we are over the maximum number of fragments
- * (due to the limit being lowered), drain off
- * enough to get down to the new limit.
+ * If we are still over the maximum number of fragmented
+ * packets, drain off enough to get down to the new limit.
*/
- while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets &&
- V_ip6q.ip6q_prev) {
- IP6STAT_INC(ip6s_fragoverflow);
- /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
- frag6_freef(V_ip6q.ip6q_prev);
+ i = 0;
+ while (V_ip6_maxfragpackets >= 0 &&
+ V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets) {
+ IP6Q_LOCK(i);
+ head = IP6Q_HEAD(i);
+ if (head->ip6q_prev != head) {
+ IP6STAT_INC(ip6s_fragoverflow);
+ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+ frag6_freef(head->ip6q_prev, i);
+ }
+ IP6Q_UNLOCK(i);
+ i = (i + 1) % IP6REASS_NHASH;
}
CURVNET_RESTORE();
}
- IP6Q_UNLOCK();
VNET_LIST_RUNLOCK_NOSLEEP();
}
@@ -785,22 +909,25 @@
frag6_drain(void)
{
VNET_ITERATOR_DECL(vnet_iter);
+ struct ip6q *head;
+ int i;
VNET_LIST_RLOCK_NOSLEEP();
- if (IP6Q_TRYLOCK() == 0) {
- VNET_LIST_RUNLOCK_NOSLEEP();
- return;
- }
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- while (V_ip6q.ip6q_next != &V_ip6q) {
- IP6STAT_INC(ip6s_fragdropped);
- /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
- frag6_freef(V_ip6q.ip6q_next);
+ for (i = 0; i < IP6REASS_NHASH; i++) {
+ if (IP6Q_TRYLOCK(i) == 0)
+ continue;
+ head = IP6Q_HEAD(i);
+ while (head->ip6q_next != head) {
+ IP6STAT_INC(ip6s_fragdropped);
+ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+ frag6_freef(head->ip6q_next, i);
+ }
+ IP6Q_UNLOCK(i);
}
CURVNET_RESTORE();
}
- IP6Q_UNLOCK();
VNET_LIST_RUNLOCK_NOSLEEP();
}
Index: sys/netinet6/in6.h
===================================================================
--- sys/netinet6/in6.h (revision 337395)
+++ sys/netinet6/in6.h (revision 337828)
@@ -637,7 +637,9 @@
#define IPV6CTL_INTRQMAXLEN 51 /* max length of IPv6 netisr queue */
#define IPV6CTL_INTRDQMAXLEN 52 /* max length of direct IPv6 netisr
* queue */
-#define IPV6CTL_MAXID 53
+#define IPV6CTL_MAXFRAGSPERPACKET 53 /* Max fragments per packet */
+#define IPV6CTL_MAXFRAGBUCKETSIZE 54 /* Max reassembly queues per bucket */
+#define IPV6CTL_MAXID 55
#endif /* __BSD_VISIBLE */
/*
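
The new identifiers above give the IPv6 knobs fixed positions in the net.inet6.ip6 MIB. A minimal sketch querying one of them by numeric OID, assuming the traditional CTL_NET / PF_INET6 / IPPROTO_IPV6 path for that tree:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <stdio.h>

int
main(void)
{
	int mib[4] = { CTL_NET, PF_INET6, IPPROTO_IPV6, IPV6CTL_MAXFRAGBUCKETSIZE };
	int val;
	size_t len = sizeof(val);

	if (sysctl(mib, 4, &val, &len, NULL, 0) == -1) {
		perror("sysctl");
		return (1);
	}
	printf("net.inet6.ip6.maxfragbucketsize = %d\n", val);
	return (0);
}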
Index: sys/netinet6/in6_proto.c
===================================================================
--- sys/netinet6/in6_proto.c (revision 337395)
+++ sys/netinet6/in6_proto.c (revision 337828)
@@ -386,7 +386,9 @@
VNET_DEFINE(int, ip6_norbit_raif) = 0;
VNET_DEFINE(int, ip6_rfc6204w3) = 0;
VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */
-VNET_DEFINE(int, ip6_maxfrags); /* initialized in frag6.c:frag6_init() */
+int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */
+VNET_DEFINE(int, ip6_maxfragbucketsize);/* initialized in frag6.c:frag6_init() */
+VNET_DEFINE(int, ip6_maxfragsperpacket); /* initialized in frag6.c:frag6_init() */
VNET_DEFINE(int, ip6_log_interval) = 5;
VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we
* process? */
@@ -473,6 +475,20 @@
return (0);
}
+static int
+sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = V_ip6_maxfragpackets;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || !req->newptr)
+ return (error);
+ V_ip6_maxfragpackets = val;
+ frag6_set_bucketsize();
+ return (0);
+}
+
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0,
"Enable forwarding of IPv6 packets between interfaces");
@@ -485,8 +501,9 @@
SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat,
ip6stat,
"IP6 statistics (struct ip6stat, netinet6/ip6_var.h)");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
- CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0,
+SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
+ sysctl_ip6_maxfragpackets, "I",
"Default maximum number of outstanding fragmented IPv6 packets. "
"A value of 0 means no fragmented packets will be accepted, while a "
"a value of -1 means no limit");
@@ -560,8 +577,16 @@
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,
"Use the default scope zone when none is specified");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags,
- CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfrags), 0,
- "Maximum allowed number of outstanding IPv6 packet fragments");
+ CTLFLAG_RW, &ip6_maxfrags, 0,
+ "Maximum allowed number of outstanding IPv6 packet fragments. "
+ "A value of 0 means no fragmented packets will be accepted, while a "
+ "a value of -1 means no limit");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0,
+ "Maximum number of reassembly queues per hash bucket");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0,
+ "Maximum allowed number of fragments per packet");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0,
"Enable path MTU discovery for multicast packets");
Index: sys/netinet6/ip6_var.h
===================================================================
--- sys/netinet6/ip6_var.h (revision 337395)
+++ sys/netinet6/ip6_var.h (revision 337828)
@@ -296,8 +296,10 @@
VNET_DECLARE(int, ip6_sendredirects); /* send IP redirects when forwarding? */
VNET_DECLARE(int, ip6_maxfragpackets); /* Maximum packets in reassembly
* queue */
-VNET_DECLARE(int, ip6_maxfrags); /* Maximum fragments in reassembly
+extern int ip6_maxfrags; /* Maximum fragments in reassembly
* queue */
+VNET_DECLARE(int, ip6_maxfragbucketsize); /* Maximum reassembly queues per bucket */
+VNET_DECLARE(int, ip6_maxfragsperpacket); /* Maximum fragments per packet */
VNET_DECLARE(int, ip6_accept_rtadv); /* Acts as a host not a router */
VNET_DECLARE(int, ip6_no_radr); /* No defroute from RA */
VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA
@@ -312,7 +314,8 @@
#define V_ip6_mrouter VNET(ip6_mrouter)
#define V_ip6_sendredirects VNET(ip6_sendredirects)
#define V_ip6_maxfragpackets VNET(ip6_maxfragpackets)
-#define V_ip6_maxfrags VNET(ip6_maxfrags)
+#define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize)
+#define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket)
#define V_ip6_accept_rtadv VNET(ip6_accept_rtadv)
#define V_ip6_no_radr VNET(ip6_no_radr)
#define V_ip6_norbit_raif VNET(ip6_norbit_raif)
@@ -399,6 +402,7 @@
int route6_input(struct mbuf **, int *, int);
+void frag6_set_bucketsize(void);
void frag6_init(void);
int frag6_input(struct mbuf **, int *, int);
void frag6_slowtimo(void);
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c (revision 337395)
+++ sys/vm/vm_page.c (revision 337828)
@@ -290,6 +290,27 @@
return (0);
}
+bool
+vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
+{
+ vm_page_t m;
+ int ret;
+
+ m = vm_phys_paddr_to_vm_page(pa);
+ if (m == NULL)
+ return (true); /* page does not exist, no failure */
+
+ mtx_lock(&vm_page_queue_free_mtx);
+ ret = vm_phys_unfree_page(m);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (ret) {
+ TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
+ if (verbose)
+ printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
+ }
+ return (ret);
+}
+
/*
* vm_page_blacklist_check:
*
@@ -301,26 +322,13 @@
vm_page_blacklist_check(char *list, char *end)
{
vm_paddr_t pa;
- vm_page_t m;
char *next;
- int ret;
next = list;
while (next != NULL) {
if ((pa = vm_page_blacklist_next(&next, end)) == 0)
continue;
- m = vm_phys_paddr_to_vm_page(pa);
- if (m == NULL)
- continue;
- mtx_lock(&vm_page_queue_free_mtx);
- ret = vm_phys_unfree_page(m);
- mtx_unlock(&vm_page_queue_free_mtx);
- if (ret == TRUE) {
- TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
- if (bootverbose)
- printf("Skipping page with pa 0x%jx\n",
- (uintmax_t)pa);
- }
+ vm_page_blacklist_add(pa, bootverbose);
}
}
Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h (revision 337395)
+++ sys/vm/vm_page.h (revision 337828)
@@ -448,6 +448,7 @@
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
vm_paddr_t boundary, vm_memattr_t memattr);
vm_page_t vm_page_alloc_freelist(int, int);
+bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
int vm_page_try_to_free (vm_page_t);
void vm_page_deactivate (vm_page_t);
Index: sys/x86/include/specialreg.h
===================================================================
--- sys/x86/include/specialreg.h (revision 337395)
+++ sys/x86/include/specialreg.h (revision 337828)
@@ -378,6 +378,7 @@
*/
#define CPUID_STDEXT3_IBPB 0x04000000
#define CPUID_STDEXT3_STIBP 0x08000000
+#define CPUID_STDEXT3_L1D_FLUSH 0x10000000
#define CPUID_STDEXT3_ARCH_CAP 0x20000000
/* MSR IA32_ARCH_CAP(ABILITIES) bits */
@@ -427,6 +428,7 @@
#define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */
#define MSR_MTRRcap 0x0fe
#define MSR_IA32_ARCH_CAP 0x10a
+#define MSR_IA32_FLUSH_CMD 0x10b
#define MSR_BBL_CR_ADDR 0x116
#define MSR_BBL_CR_DECC 0x118
#define MSR_BBL_CR_CTL 0x119
@@ -580,6 +582,9 @@
/* MSR IA32_PRED_CMD */
#define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL
+/* MSR IA32_FLUSH_CMD */
+#define IA32_FLUSH_CMD_L1D 0x00000001
+
/*
* PAT modes.
*/