From 9fb832ce382d649b7687426e6bc4e5d3715cb78a Mon Sep 17 00:00:00 2001
From: Kewen Lin <linkw@linux.ibm.com>
Date: Sun, 19 Jul 2020 20:40:10 -0500
Subject: [PATCH] vect: Support length-based partial vectors approach

Power9 supports vector load/store instruction lxvl/stxvl which allow
us to operate partial vectors with one specific length.  This patch
extends some of current mask-based partial vectors support code for
length-based approach, also adds some length specific support code.
So far it assumes that we can only have one partial vectors approach
at the same time, it will disable to use partial vectors if both
approaches co-exist.

Like the description of optab len_load/len_store, the length-based
approach can have two flavors, one is length in bytes, the other is
length in lanes.  This patch is mainly implemented and tested for
length in bytes, but as Richard S. suggested, most of code has
considered both flavors.

This also introduces one parameter vect-partial-vector-usage allow
users to control when the loop vectorizer considers using partial
vectors as an alternative to falling back to scalar code.

gcc/ChangeLog:

	* config/rs6000/rs6000.c (rs6000_option_override_internal):
	Set param_vect_partial_vector_usage to 0 explicitly.
	* doc/invoke.texi (vect-partial-vector-usage): Document new option.
	* optabs-query.c (get_len_load_store_mode): New function.
	* optabs-query.h (get_len_load_store_mode): New declare.
	* params.opt (vect-partial-vector-usage): New.
	* tree-vect-loop-manip.c (vect_set_loop_controls_directly): Add the
	handlings for vectorization using length-based partial vectors, call
	vect_gen_len for length generation, and rename some variables with
	items instead of scalars.
	(vect_set_loop_condition_partial_vectors): Add the handlings for
	vectorization using length-based partial vectors.
	(vect_do_peeling): Allow remaining eiters less than epilogue vf for
	LOOP_VINFO_USING_PARTIAL_VECTORS_P.
	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Init
	epil_using_partial_vectors_p.
	(_loop_vec_info::~_loop_vec_info): Call release_vec_loop_controls
	for lengths destruction.
	(vect_verify_loop_lens): New function.
	(vect_analyze_loop): Add handlings for epilogue of loop when it's
	marked to use vectorization using partial vectors.
	(vect_analyze_loop_2): Add the check to allow only one vectorization
	approach using partial vectorization at the same time.  Check param
	vect-partial-vector-usage for partial vectors decision.  Mark
	LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P if the epilogue is
	considerable to use partial vectors.  Call release_vec_loop_controls
	for lengths destruction.
	(vect_estimate_min_profitable_iters): Adjust for loop vectorization
	using length-based partial vectors.
	(vect_record_loop_mask): Init factor to 1 for vectorization using
	mask-based partial vectors.
	(vect_record_loop_len): New function.
	(vect_get_loop_len): Likewise.
	* tree-vect-stmts.c (check_load_store_for_partial_vectors): Add
	checks for vectorization using length-based partial vectors.  Factor
	some code to lambda function get_valid_nvectors.
	(vectorizable_store): Add handlings when using length-based partial
	vectors.
	(vectorizable_load): Likewise.
	(vect_gen_len): New function.
	* tree-vectorizer.h (struct rgroup_controls): Add field factor
	mainly for length-based partial vectors.
	(vec_loop_lens): New typedef.
	(_loop_vec_info): Add lens and epil_using_partial_vectors_p.
	(LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P): New macro.
	(LOOP_VINFO_LENS): Likewise.
	(LOOP_VINFO_FULLY_WITH_LENGTH_P): Likewise.
	(vect_record_loop_len): New declare.
	(vect_get_loop_len): Likewise.
	(vect_gen_len): Likewise.
---
 gcc/config/rs6000/rs6000.c |   5 +
 gcc/doc/invoke.texi        |   9 ++
 gcc/optabs-query.c         |  27 +++++
 gcc/optabs-query.h         |   1 +
 gcc/params.opt             |   4 +
 gcc/tree-vect-loop-manip.c | 182 ++++++++++++++++++-------------
 gcc/tree-vect-loop.c       | 217 ++++++++++++++++++++++++++++++++++++-
 gcc/tree-vect-stmts.c      | 167 +++++++++++++++++++++++++---
 gcc/tree-vectorizer.h      |  35 +++++-
 9 files changed, 552 insertions(+), 95 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index c5c3300c59c..141854e5d82 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -4629,6 +4629,11 @@ rs6000_option_override_internal (bool global_init_p)
       SET_OPTION_IF_UNSET (&global_options, &global_options_set,
 			   param_max_completely_peeled_insns, 400);
 
+      /* Temporarily disable it for now since lxvl/stxvl on the default
+	 supported hardware Power9 has unexpected performance behaviors.  */
+      SET_OPTION_IF_UNSET (&global_options, &global_options_set,
+			   param_vect_partial_vector_usage, 0);
+
       /* Use the 'model' -fsched-pressure algorithm by default.  */
       SET_OPTION_IF_UNSET (&global_options, &global_options_set,
 			   param_sched_pressure_algorithm,
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 825fd669a75..ba18e05fb1a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13448,6 +13448,15 @@ by the copy loop headers pass.
 @item vect-epilogues-nomask
 Enable loop epilogue vectorization using smaller vector size.
 
+@item vect-partial-vector-usage
+Controls when the loop vectorizer considers using partial vector loads
+and stores as an alternative to falling back to scalar code.  0 stops
+the vectorizer from ever using partial vector loads and stores.  1 allows
+partial vector loads and stores if vectorization removes the need for the
+code to iterate.  2 allows partial vector loads and stores in all loops.
+The parameter only has an effect on targets that support partial
+vector loads and stores.
+
 @item slp-max-insns-in-bb
 Maximum number of instructions in basic block to be
 considered for SLP vectorization.
diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
index 215d68e4225..be241057e4f 100644
--- a/gcc/optabs-query.c
+++ b/gcc/optabs-query.c
@@ -606,6 +606,33 @@ can_vec_mask_load_store_p (machine_mode mode,
   return false;
 }
 
+/* If target supports vector load/store with length for vector mode MODE,
+   return the corresponding vector mode, otherwise return opt_machine_mode ().
+   There are two flavors for vector load/store with length, one is to measure
+   length with bytes, the other is to measure length with lanes.
+   As len_{load,store} optabs point out, for the flavor with bytes, we use
+   VnQI to wrap the other supportable same size vector modes.  */
+
+opt_machine_mode
+get_len_load_store_mode (machine_mode mode, bool is_load)
+{
+  optab op = is_load ? len_load_optab : len_store_optab;
+  gcc_assert (VECTOR_MODE_P (mode));
+
+  /* Check if length in lanes supported for this mode directly.  */
+  if (direct_optab_handler (op, mode))
+    return mode;
+
+  /* Check if length in bytes supported for same vector size VnQI.  */
+  machine_mode vmode;
+  poly_uint64 nunits = GET_MODE_SIZE (mode);
+  if (related_vector_mode (mode, QImode, nunits).exists (&vmode)
+      && direct_optab_handler (op, vmode))
+    return vmode;
+
+  return opt_machine_mode ();
+}
+
 /* Return true if there is a compare_and_swap pattern.  */
 
 bool
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 729e1fdfc81..603ea8cac0d 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -188,6 +188,7 @@ enum insn_code find_widening_optab_handler_and_mode (optab, machine_mode,
 						     machine_mode *);
 int can_mult_highpart_p (machine_mode, bool);
 bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool);
+opt_machine_mode get_len_load_store_mode (machine_mode, bool);
 bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool can_atomic_load_p (machine_mode);
diff --git a/gcc/params.opt b/gcc/params.opt
index e29a44e7712..b36eee0e71f 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -972,4 +972,8 @@ Bound on number of runtime checks inserted by the vectorizer's loop versioning f
 Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
 Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
 
+-param=vect-partial-vector-usage=
+Common Joined UInteger Var(param_vect_partial_vector_usage) Init(2) IntegerRange(0, 2) Param Optimization
+Controls how loop vectorizer uses partial vectors.  0 means never, 1 means only for loops whose need to iterate can be removed, 2 means for all loops.  The default value is 2.
+
 ; This comment is to ensure we retain the blank line above.
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 169ea64fd9e..490e7befea3 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -399,19 +399,20 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 
    It is known that:
 
-     NITERS * RGC->max_nscalars_per_iter
+     NITERS * RGC->max_nscalars_per_iter * RGC->factor
 
    does not overflow.  However, MIGHT_WRAP_P says whether an induction
    variable that starts at 0 and has step:
 
-     VF * RGC->max_nscalars_per_iter
+     VF * RGC->max_nscalars_per_iter * RGC->factor
 
    might overflow before hitting a value above:
 
-     (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter
+     (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
 
    This means that we cannot guarantee that such an induction variable
-   would ever hit a value that produces a set of all-false masks for RGC.  */
+   would ever hit a value that produces a set of all-false masks or zero
+   lengths for RGC.  */
 
 static tree
 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
@@ -422,40 +423,46 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 {
   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+  bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+
   tree ctrl_type = rgc->type;
-  unsigned int nscalars_per_iter = rgc->max_nscalars_per_iter;
-  poly_uint64 nscalars_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
+  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
+  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  tree length_limit = NULL_TREE;
+  /* For length, we need length_limit to ensure length in range.  */
+  if (!use_masks_p)
+    length_limit = build_int_cst (compare_type, nitems_per_ctrl);
 
-  /* Calculate the maximum number of scalar values that the rgroup
+  /* Calculate the maximum number of item values that the rgroup
      handles in total, the number that it handles for each iteration
      of the vector loop, and the number that it should skip during the
      first iteration of the vector loop.  */
-  tree nscalars_total = niters;
-  tree nscalars_step = build_int_cst (iv_type, vf);
-  tree nscalars_skip = niters_skip;
-  if (nscalars_per_iter != 1)
+  tree nitems_total = niters;
+  tree nitems_step = build_int_cst (iv_type, vf);
+  tree nitems_skip = niters_skip;
+  if (nitems_per_iter != 1)
     {
       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
 	 these multiplications don't overflow.  */
-      tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
-      tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
-      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
-				     nscalars_total, compare_factor);
-      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
-				    nscalars_step, iv_factor);
-      if (nscalars_skip)
-	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
-				      nscalars_skip, compare_factor);
-    }
-
-  /* Create an induction variable that counts the number of scalars
+      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
+      tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
+      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+				   nitems_total, compare_factor);
+      nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
+				  nitems_step, iv_factor);
+      if (nitems_skip)
+	nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+				    nitems_skip, compare_factor);
+    }
+
+  /* Create an induction variable that counts the number of items
      processed.  */
   tree index_before_incr, index_after_incr;
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
+  create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
 	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
 
   tree zero_index = build_int_cst (compare_type, 0);
@@ -466,70 +473,70 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
       /* In principle the loop should stop iterating once the incremented
 	 IV reaches a value greater than or equal to:
 
-	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
+	   NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
 
 	 However, there's no guarantee that this addition doesn't overflow
 	 the comparison type, or that the IV hits a value above it before
 	 wrapping around.  We therefore adjust the limit down by one
 	 IV step:
 
-	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
-	   -[infinite-prec] NSCALARS_STEP
+	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
+	   -[infinite-prec] NITEMS_STEP
 
 	 and compare the IV against this limit _before_ incrementing it.
 	 Since the comparison type is unsigned, we actually want the
 	 subtraction to saturate at zero:
 
-	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
-	   -[sat] NSCALARS_STEP
+	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
+	   -[sat] NITEMS_STEP
 
-	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
+	 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
 
-	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
+	   NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
 
 	 where the rightmost subtraction can be done directly in
 	 COMPARE_TYPE.  */
       test_index = index_before_incr;
       tree adjust = gimple_convert (preheader_seq, compare_type,
-				    nscalars_step);
-      if (nscalars_skip)
+				    nitems_step);
+      if (nitems_skip)
 	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
-			       adjust, nscalars_skip);
+			       adjust, nitems_skip);
       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
-				 nscalars_total, adjust);
+				 nitems_total, adjust);
       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 				 test_limit, adjust);
       test_gsi = &incr_gsi;
 
       /* Get a safe limit for the first iteration.  */
-      if (nscalars_skip)
+      if (nitems_skip)
 	{
-	  /* The first vector iteration can handle at most NSCALARS_STEP
-	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
-	     NSCALARS_SKIP to that cannot overflow.  */
+	  /* The first vector iteration can handle at most NITEMS_STEP
+	     items.  NITEMS_STEP <= CONST_LIMIT, and adding
+	     NITEMS_SKIP to that cannot overflow.  */
 	  tree const_limit = build_int_cst (compare_type,
 					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-					    * nscalars_per_iter);
+					    * nitems_per_iter);
 	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
-				      nscalars_total, const_limit);
+				      nitems_total, const_limit);
 	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
-				      first_limit, nscalars_skip);
+				      first_limit, nitems_skip);
 	}
       else
 	/* For the first iteration it doesn't matter whether the IV hits
-	   a value above NSCALARS_TOTAL.  That only matters for the latch
+	   a value above NITEMS_TOTAL.  That only matters for the latch
 	   condition.  */
-	first_limit = nscalars_total;
+	first_limit = nitems_total;
     }
   else
     {
       /* Test the incremented IV, which will always hit a value above
 	 the bound before wrapping.  */
       test_index = index_after_incr;
-      test_limit = nscalars_total;
-      if (nscalars_skip)
+      test_limit = nitems_total;
+      if (nitems_skip)
 	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
-				   test_limit, nscalars_skip);
+				   test_limit, nitems_skip);
       test_gsi = &loop_cond_gsi;
 
       first_limit = test_limit;
@@ -547,18 +554,17 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   unsigned int i;
   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
     {
-      /* Previous controls will cover BIAS scalars.  This control covers the
+      /* Previous controls will cover BIAS items.  This control covers the
 	 next batch.  */
-      poly_uint64 bias = nscalars_per_ctrl * i;
+      poly_uint64 bias = nitems_per_ctrl * i;
       tree bias_tree = build_int_cst (compare_type, bias);
-      gimple *tmp_stmt;
 
       /* See whether the first iteration of the vector loop is known
 	 to have a full control.  */
       poly_uint64 const_limit;
       bool first_iteration_full
 	= (poly_int_tree_p (first_limit, &const_limit)
-	   && known_ge (const_limit, (i + 1) * nscalars_per_ctrl));
+	   && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
 
       /* Rather than have a new IV that starts at BIAS and goes up to
 	 TEST_LIMIT, prefer to use the same 0-based IV for each control
@@ -574,7 +580,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 					  bias_tree);
 	}
 
-      /* Create the initial control.  First include all scalars that
+      /* Create the initial control.  First include all items that
 	 are within the loop limit.  */
       tree init_ctrl = NULL_TREE;
       if (!first_iteration_full)
@@ -591,27 +597,38 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 	    }
 	  else
 	    {
-	      /* FIRST_LIMIT is the maximum number of scalars handled by the
+	      /* FIRST_LIMIT is the maximum number of items handled by the
 		 first iteration of the vector loop.  Test the portion
 		 associated with this control.  */
 	      start = bias_tree;
 	      end = first_limit;
 	    }
 
-	  init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
-	  tmp_stmt = vect_gen_while (init_ctrl, start, end);
-	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
+	  if (use_masks_p)
+	    {
+	      init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
+	      gimple *tmp_stmt = vect_gen_while (init_ctrl, start, end);
+	      gimple_seq_add_stmt (preheader_seq, tmp_stmt);
+	    }
+	  else
+	    {
+	      init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
+	      gimple_seq seq = vect_gen_len (init_ctrl, start,
+					     end, length_limit);
+	      gimple_seq_add_seq (preheader_seq, seq);
+	    }
 	}
 
       /* Now AND out the bits that are within the number of skipped
-	 scalars.  */
+	 items.  */
       poly_uint64 const_skip;
-      if (nscalars_skip
-	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
+      if (nitems_skip
+	  && !(poly_int_tree_p (nitems_skip, &const_skip)
 	       && known_le (const_skip, bias)))
 	{
+	  gcc_assert (use_masks_p);
 	  tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
-						    bias_tree, nscalars_skip);
+						    bias_tree, nitems_skip);
 	  if (init_ctrl)
 	    init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
 				      init_ctrl, unskipped_mask);
@@ -620,13 +637,28 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 	}
 
       if (!init_ctrl)
-	/* First iteration is full.  */
-	init_ctrl = build_minus_one_cst (ctrl_type);
+	{
+	  /* First iteration is full.  */
+	  if (use_masks_p)
+	    init_ctrl = build_minus_one_cst (ctrl_type);
+	  else
+	    init_ctrl = length_limit;
+	}
 
       /* Get the control value for the next iteration of the loop.  */
-      next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
-      gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
-      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
+      if (use_masks_p)
+	{
+	  next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
+	  gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
+	  gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
+	}
+      else
+	{
+	  next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
+	  gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
+					 length_limit);
+	  gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
+	}
 
       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
     }
@@ -652,6 +684,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
   gimple_seq preheader_seq = NULL;
   gimple_seq header_seq = NULL;
 
+  bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
   tree orig_niters = niters;
@@ -686,28 +719,30 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
   tree test_ctrl = NULL_TREE;
   rgroup_controls *rgc;
   unsigned int i;
-  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
-  FOR_EACH_VEC_ELT (*masks, i, rgc)
+  auto_vec<rgroup_controls> *controls = use_masks_p
+					  ? &LOOP_VINFO_MASKS (loop_vinfo)
+					  : &LOOP_VINFO_LENS (loop_vinfo);
+  FOR_EACH_VEC_ELT (*controls, i, rgc)
     if (!rgc->controls.is_empty ())
       {
 	/* First try using permutes.  This adds a single vector
 	   instruction to the loop for each mask, but needs no extra
 	   loop invariants or IVs.  */
 	unsigned int nmasks = i + 1;
-	if ((nmasks & 1) == 0)
+	if (use_masks_p && (nmasks & 1) == 0)
 	  {
-	    rgroup_controls *half_rgc = &(*masks)[nmasks / 2 - 1];
+	    rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
 	    if (!half_rgc->controls.is_empty ()
 		&& vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
 	      continue;
 	  }
 
 	/* See whether zero-based IV would ever generate all-false masks
-	   before wrapping around.  */
+	   or zero length before wrapping around.  */
+	unsigned nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
 	bool might_wrap_p
 	  = (iv_limit == -1
-	     || (wi::min_precision (iv_limit * rgc->max_nscalars_per_iter,
-				    UNSIGNED)
+	     || (wi::min_precision (iv_limit * nitems_per_iter, UNSIGNED)
 		 > compare_precision));
 
 	/* Set up all controls for this group.  */
@@ -2568,7 +2603,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
   if (vect_epilogues
       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       && prolog_peeling >= 0
-      && known_eq (vf, lowest_vf))
+      && known_eq (vf, lowest_vf)
+      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (epilogue_vinfo))
     {
       unsigned HOST_WIDE_INT eiters
 	= (LOOP_VINFO_INT_NITERS (loop_vinfo)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 0a9be35ddec..e933441b922 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -816,6 +816,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vectorizable (false),
     can_use_partial_vectors_p (true),
     using_partial_vectors_p (false),
+    epil_using_partial_vectors_p (false),
     peeling_for_gaps (false),
     peeling_for_niter (false),
     no_data_dependencies (false),
@@ -898,6 +899,7 @@ _loop_vec_info::~_loop_vec_info ()
   free (bbs);
 
   release_vec_loop_controls (&masks);
+  release_vec_loop_controls (&lens);
   delete ivexpr_map;
   delete scan_map;
   epilogue_vinfos.release ();
@@ -1072,6 +1074,81 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   return true;
 }
 
+/* Check whether we can use vector access with length based on precison
+   comparison.  So far, to keep it simple, we only allow the case that the
+   precision of the target supported length is larger than the precision
+   required by loop niters.  */
+
+static bool
+vect_verify_loop_lens (loop_vec_info loop_vinfo)
+{
+  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+    return false;
+
+  unsigned int max_nitems_per_iter = 1;
+  unsigned int i;
+  rgroup_controls *rgl;
+  /* Find the maximum number of items per iteration for every rgroup.  */
+  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
+    {
+      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
+      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
+    }
+
+  /* Work out how many bits we need to represent the length limit.  */
+  unsigned int min_ni_prec
+    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
+
+  /* Now use the maximum of below precisions for one suitable IV type:
+     - the IV's natural precision
+     - the precision needed to hold: the maximum number of scalar
+       iterations multiplied by the scale factor (min_ni_prec above)
+     - the Pmode precision
+
+     If min_ni_prec is less than the precision of the current niters,
+     we perfer to still use the niters type.  Prefer to use Pmode and
+     wider IV to avoid narrow conversions.  */
+
+  unsigned int ni_prec
+    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
+  min_ni_prec = MAX (min_ni_prec, ni_prec);
+  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
+
+  tree iv_type = NULL_TREE;
+  opt_scalar_int_mode tmode_iter;
+  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
+    {
+      scalar_mode tmode = tmode_iter.require ();
+      unsigned int tbits = GET_MODE_BITSIZE (tmode);
+
+      /* ??? Do we really want to construct one IV whose precision exceeds
+	 BITS_PER_WORD?  */
+      if (tbits > BITS_PER_WORD)
+	break;
+
+      /* Find the first available standard integral type.  */
+      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
+	{
+	  iv_type = build_nonstandard_integer_type (tbits, true);
+	  break;
+	}
+    }
+
+  if (!iv_type)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't vectorize with length-based partial vectors"
+			 " because there is no suitable iv type.\n");
+      return false;
+    }
+
+  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
+  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+
+  return true;
+}
+
 /* Calculate the cost of one scalar iteration of the loop.  */
 static void
 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
@@ -2168,11 +2245,48 @@ start_over:
       return ok;
     }
 
-  /* Decide whether to use a fully-masked loop for this vectorization
-     factor.  */
-  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
-    = (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
-       && vect_verify_full_masking (loop_vinfo));
+  /* For now, we don't expect to mix both masking and length approaches for one
+     loop, disable it if both are recorded.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't vectorize a loop with partial vectors"
+			 " because we don't expect to mix different"
+			 " approaches with partial vectors for the"
+			 " same loop.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+
+  /* Decide whether to vectorize a loop with partial vectors for
+     this vectorization factor.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+    {
+      if (param_vect_partial_vector_usage == 0)
+	LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+      else if (vect_verify_full_masking (loop_vinfo)
+	       || vect_verify_loop_lens (loop_vinfo))
+	{
+	  /* The epilogue and other known niters less than VF
+	    cases can still use vector access with length fully.  */
+	  if (param_vect_partial_vector_usage == 1
+	      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+	      && !vect_known_niters_smaller_than_vf (loop_vinfo))
+	    {
+	      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+	    }
+	  else
+	    LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+	}
+      else
+	LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+  else
+    LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+
   if (dump_enabled_p ())
     {
       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
@@ -2404,6 +2518,7 @@ again:
     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
   /* Reset accumulated rgroup information.  */
   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
+  release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
   /* Reset assorted flags.  */
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
@@ -2690,7 +2805,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 		lowest_th = ordered_min (lowest_th, th);
 	    }
 	  else
-	    delete loop_vinfo;
+	    {
+	      delete loop_vinfo;
+	      loop_vinfo = opt_loop_vec_info::success (NULL);
+	    }
 
 	  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
 	     enabled, SIMDUID is not set, it is the innermost loop and we have
@@ -2715,6 +2833,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       else
 	{
 	  delete loop_vinfo;
+	  loop_vinfo = opt_loop_vec_info::success (NULL);
 	  if (fatal)
 	    {
 	      gcc_checking_assert (first_loop_vinfo == NULL);
@@ -2722,6 +2841,23 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 	    }
 	}
 
+      /* Handle the case that the original loop can use partial
+	 vectorization, but want to only adopt it for the epilogue.
+	 The retry should be in the same mode as original.  */
+      if (vect_epilogues
+	  && loop_vinfo
+	  && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+	{
+	  gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+		      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "***** Re-trying analysis with same vector mode"
+			     " %s for epilogue with partial vectors.\n",
+			     GET_MODE_NAME (loop_vinfo->vector_mode));
+	  continue;
+	}
+
       if (mode_i < vector_modes.length ()
 	  && VECTOR_MODE_P (autodetected_vector_mode)
 	  && (related_vector_mode (vector_modes[mode_i],
@@ -3562,6 +3698,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 			    target_cost_data, num_masks - 1, vector_stmt,
 			    NULL, NULL_TREE, 0, vect_body);
     }
+  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+    {
+      peel_iters_prologue = 0;
+      peel_iters_epilogue = 0;
+    }
   else if (npeel < 0)
     {
       peel_iters_prologue = assumed_vf / 2;
@@ -8194,6 +8335,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
     {
       rgm->max_nscalars_per_iter = nscalars_per_iter;
       rgm->type = truth_type_for (vectype);
+      rgm->factor = 1;
     }
 }
 
@@ -8246,6 +8388,69 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
   return mask;
 }
 
+/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
+   lengths for controlling an operation on VECTYPE.  The operation splits
+   each element of VECTYPE into FACTOR separate subelements, measuring the
+   length as a number of these subelements.  */
+
+void
+vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+		      unsigned int nvectors, tree vectype, unsigned int factor)
+{
+  gcc_assert (nvectors != 0);
+  if (lens->length () < nvectors)
+    lens->safe_grow_cleared (nvectors);
+  rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* The number of scalars per iteration, scalar occupied bytes and
+     the number of vectors are both compile-time constants.  */
+  unsigned int nscalars_per_iter
+    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
+    {
+      /* For now, we only support cases in which all loads and stores fall back
+	 to VnQI or none do.  */
+      gcc_assert (!rgl->max_nscalars_per_iter
+		  || (rgl->factor == 1 && factor == 1)
+		  || (rgl->max_nscalars_per_iter * rgl->factor
+		      == nscalars_per_iter * factor));
+      rgl->max_nscalars_per_iter = nscalars_per_iter;
+      rgl->type = vectype;
+      rgl->factor = factor;
+    }
+}
+
+/* Given a complete set of length LENS, extract length number INDEX for an
+   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
+
+tree
+vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+		   unsigned int nvectors, unsigned int index)
+{
+  rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* Populate the rgroup's len array, if this is the first time we've
+     used it.  */
+  if (rgl->controls.is_empty ())
+    {
+      rgl->controls.safe_grow_cleared (nvectors);
+      for (unsigned int i = 0; i < nvectors; ++i)
+	{
+	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+	  gcc_assert (len_type != NULL_TREE);
+	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
+
+	  /* Provide a dummy definition until the real one is available.  */
+	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
+	  rgl->controls[i] = len;
+	}
+    }
+
+  return rgl->controls[index];
+}
+
 /* Scale profiling counters by estimation for LOOP which is vectorized
    by factor VF.  */
 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 6730cae8085..31af46ae19c 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1713,29 +1713,58 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       return;
     }
 
-  machine_mode mask_mode;
-  if (!VECTOR_MODE_P (vecmode)
-      || !targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
-      || !can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
+  if (!VECTOR_MODE_P (vecmode))
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "can't use a fully-masked loop because the target"
-			 " doesn't have the appropriate masked load or"
-			 " store.\n");
+			 "can't operate on partial vectors when emulating"
+			 " vector operations.\n");
       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
       return;
     }
+
   /* We might load more scalars than we need for permuting SLP loads.
      We checked in get_group_load_store_type that the extra elements
      don't leak into a new vector.  */
+  auto get_valid_nvectors = [] (poly_uint64 size, poly_uint64 nunits)
+  {
+    unsigned int nvectors;
+    if (can_div_away_from_zero_p (size, nunits, &nvectors))
+      return nvectors;
+    gcc_unreachable ();
+  };
+
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  unsigned int nvectors;
-  if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
-    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
-  else
-    gcc_unreachable ();
+  machine_mode mask_mode;
+  bool using_partial_vectors_p = false;
+  if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
+      && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
+    {
+      unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
+      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
+      using_partial_vectors_p = true;
+    }
+
+  machine_mode vmode;
+  if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
+    {
+      unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits);
+      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+      unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vecmode);
+      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
+      using_partial_vectors_p = true;
+    }
+
+  if (!using_partial_vectors_p)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't operate on partial vectors because the"
+			 " target doesn't have the appropriate partial"
+			 " vectorization load or store.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
 }
 
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
@@ -7694,6 +7723,14 @@ vectorizable_store (vec_info *vinfo,
     = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
        ? &LOOP_VINFO_MASKS (loop_vinfo)
        : NULL);
+  vec_loop_lens *loop_lens
+    = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
+       ? &LOOP_VINFO_LENS (loop_vinfo)
+       : NULL);
+
+  /* Shouldn't go with length-based approach if fully masked.  */
+  gcc_assert (!loop_lens || !loop_masks);
+
   /* Targets with store-lane instructions must not require explicit
      realignment.  vect_supportable_dr_alignment always returns either
      dr_aligned or dr_unaligned_supported for masked operations.  */
@@ -8033,6 +8070,41 @@ vectorizable_store (vec_info *vinfo,
 		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
 		  new_stmt = call;
 		}
+	      else if (loop_lens)
+		{
+		  tree final_len
+		    = vect_get_loop_len (loop_vinfo, loop_lens,
+					 vec_num * ncopies, vec_num * j + i);
+		  align = least_bit_hwi (misalign | align);
+		  tree ptr = build_int_cst (ref_type, align);
+		  machine_mode vmode = TYPE_MODE (vectype);
+		  opt_machine_mode new_ovmode
+		    = get_len_load_store_mode (vmode, false);
+		  machine_mode new_vmode = new_ovmode.require ();
+		  /* Need conversion if it's wrapped with VnQI.  */
+		  if (vmode != new_vmode)
+		    {
+		      tree new_vtype
+			= build_vector_type_for_mode (unsigned_intQI_type_node,
+						      new_vmode);
+		      tree var
+			= vect_get_new_ssa_name (new_vtype, vect_simple_var);
+		      vec_oprnd
+			= build1 (VIEW_CONVERT_EXPR, new_vtype, vec_oprnd);
+		      gassign *new_stmt
+			= gimple_build_assign (var, VIEW_CONVERT_EXPR,
+					       vec_oprnd);
+		      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
+						   gsi);
+		      vec_oprnd = var;
+		    }
+		  gcall *call
+		    = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
+						  ptr, final_len, vec_oprnd);
+		  gimple_call_set_nothrow (call, true);
+		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+		  new_stmt = call;
+		}
 	      else
 		{
 		  data_ref = fold_build2 (MEM_REF, vectype,
@@ -8577,7 +8649,7 @@ vectorizable_load (vec_info *vinfo,
       unsigned HOST_WIDE_INT cst_offset = 0;
       tree dr_offset;
 
-      gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
       gcc_assert (!nested_in_vect_loop);
 
       if (grouped_load)
@@ -8859,6 +8931,14 @@ vectorizable_load (vec_info *vinfo,
     = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
        ? &LOOP_VINFO_MASKS (loop_vinfo)
        : NULL);
+  vec_loop_lens *loop_lens
+    = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
+       ? &LOOP_VINFO_LENS (loop_vinfo)
+       : NULL);
+
+  /* Shouldn't go with length-based approach if fully masked.  */
+  gcc_assert (!loop_lens || !loop_masks);
+
   /* Targets with store-lane instructions must not require explicit
      realignment.  vect_supportable_dr_alignment always returns either
      dr_aligned or dr_unaligned_supported for masked operations.  */
@@ -9247,6 +9327,43 @@ vectorizable_load (vec_info *vinfo,
 			new_stmt = call;
 			data_ref = NULL_TREE;
 		      }
+		    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
+		      {
+			tree final_len
+			  = vect_get_loop_len (loop_vinfo, loop_lens,
+					       vec_num * ncopies,
+					       vec_num * j + i);
+			align = least_bit_hwi (misalign | align);
+			tree ptr = build_int_cst (ref_type, align);
+			gcall *call
+			  = gimple_build_call_internal (IFN_LEN_LOAD, 3,
+							dataref_ptr, ptr,
+							final_len);
+			gimple_call_set_nothrow (call, true);
+			new_stmt = call;
+			data_ref = NULL_TREE;
+
+			/* Need conversion if it's wrapped with VnQI.  */
+			machine_mode vmode = TYPE_MODE (vectype);
+			opt_machine_mode new_ovmode
+			  = get_len_load_store_mode (vmode, true);
+			machine_mode new_vmode = new_ovmode.require ();
+			if (vmode != new_vmode)
+			  {
+			    tree qi_type = unsigned_intQI_type_node;
+			    tree new_vtype
+			      = build_vector_type_for_mode (qi_type, new_vmode);
+			    tree var = vect_get_new_ssa_name (new_vtype,
+							      vect_simple_var);
+			    gimple_set_lhs (call, var);
+			    vect_finish_stmt_generation (vinfo, stmt_info, call,
+							 gsi);
+			    tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
+			    new_stmt
+			      = gimple_build_assign (vec_dest,
+						     VIEW_CONVERT_EXPR, op);
+			  }
+		      }
 		    else
 		      {
 			tree ltype = vectype;
@@ -11967,3 +12084,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
   *nunits_vectype_out = nunits_vectype;
   return opt_result::success ();
 }
+
+/* Generate and return statement sequence that sets vector length LEN that is:
+
+   min_of_start_and_end = min (START_INDEX, END_INDEX);
+   left_len = END_INDEX - min_of_start_and_end;
+   rhs = min (left_len, LEN_LIMIT);
+   LEN = rhs;  */
+
+gimple_seq
+vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit)
+{
+  gimple_seq stmts = NULL;
+  tree len_type = TREE_TYPE (len);
+  gcc_assert (TREE_TYPE (start_index) == len_type);
+
+  tree min = gimple_build (&stmts, MIN_EXPR, len_type, start_index, end_index);
+  tree left_len = gimple_build (&stmts, MINUS_EXPR, len_type, end_index, min);
+  tree rhs = gimple_build (&stmts, MIN_EXPR, len_type, left_len, len_limit);
+  gimple* stmt = gimple_build_assign (len, rhs);
+  gimple_seq_add_stmt (&stmts, stmt);
+
+  return stmts;
+}
+
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 91d3291ab56..5466c78c20b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -480,14 +480,21 @@ is_a_helper <_bb_vec_info *>::test (vec_info *i)
    first level being indexed by nV - 1 (since nV == 0 doesn't exist) and
    the second being indexed by the mask index 0 <= i < nV.  */
 
-/* The controls (like masks) needed by rgroups with nV vectors,
+/* The controls (like masks or lengths) needed by rgroups with nV vectors,
    according to the description above.  */
 struct rgroup_controls {
   /* The largest nS for all rgroups that use these controls.  */
   unsigned int max_nscalars_per_iter;
 
-  /* The type of control to use, based on the highest nS recorded above.
-     For mask-based approach, it's used for mask_type.  */
+  /* For the largest nS recorded above, the loop controls divide each scalar
+     into FACTOR equal-sized pieces.  This is useful if we need to split
+     element-based accesses into byte-based accesses.  */
+  unsigned int factor;
+
+  /* This is a vector type with MAX_NSCALARS_PER_ITER * VF / nV elements.
+     For mask-based controls, it is the type of the masks in CONTROLS.
+     For length-based controls, it can be any vector type that has the
+     specified number of elements; the type of the elements doesn't matter.  */
   tree type;
 
   /* A vector of nV controls, in iteration order.  */
@@ -496,6 +503,8 @@ struct rgroup_controls {
 
 typedef auto_vec<rgroup_controls> vec_loop_masks;
 
+typedef auto_vec<rgroup_controls> vec_loop_lens;
+
 typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
 
 /*-----------------------------------------------------------------*/
@@ -543,6 +552,10 @@ public:
      on inactive scalars.  */
   vec_loop_masks masks;
 
+  /* The lengths that a loop with length should use to avoid operating
+     on inactive scalars.  */
+  vec_loop_lens lens;
+
   /* Set of scalar conditions that have loop mask applied.  */
   scalar_cond_masked_set_type scalar_cond_masked_set;
 
@@ -651,6 +664,10 @@ public:
      the vector loop can handle fewer than VF scalars.  */
   bool using_partial_vectors_p;
 
+  /* True if we've decided to use partially-populated vectors for the
+     epilogue of loop.  */
+  bool epil_using_partial_vectors_p;
+
   /* When we have grouped data accesses with gaps, we may introduce invalid
      memory accesses.  We peel the last iteration of the loop to prevent
      this.  */
@@ -714,9 +731,12 @@ public:
 #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
 #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
+  (L)->epil_using_partial_vectors_p
 #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
 #define LOOP_VINFO_MASKS(L)                (L)->masks
+#define LOOP_VINFO_LENS(L)                 (L)->lens
 #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
 #define LOOP_VINFO_RGROUP_COMPARE_TYPE(L)  (L)->rgroup_compare_type
 #define LOOP_VINFO_RGROUP_IV_TYPE(L)       (L)->rgroup_iv_type
@@ -754,6 +774,10 @@ public:
   (LOOP_VINFO_USING_PARTIAL_VECTORS_P (L)	\
    && !LOOP_VINFO_MASKS (L).is_empty ())
 
+#define LOOP_VINFO_FULLY_WITH_LENGTH_P(L)	\
+  (LOOP_VINFO_USING_PARTIAL_VECTORS_P (L)	\
+   && !LOOP_VINFO_LENS (L).is_empty ())
+
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L)	\
   ((L)->may_misalign_stmts.length () > 0)
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIAS(L)		\
@@ -1953,6 +1977,11 @@ extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
 				   unsigned int, tree, tree);
 extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);
+extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
+				  tree, unsigned int);
+extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
+			       unsigned int);
+extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
 
 /* Drive for loop transformation stage.  */
-- 
2.30.2