LCOV - code coverage report
Current view: top level - gcc - gimple-loop-jam.c (source / functions) Hit Total Coverage
Test: gcc.info Lines: 207 212 97.6 %
Date: 2020-04-04 11:58:09 Functions: 10 10 100.0 %
Legend: Lines: hit not hit | Branches: + taken - not taken # not executed Branches: 0 0 -

           Branch data     Line data    Source code
       1                 :            : /* Loop unroll-and-jam.
       2                 :            :    Copyright (C) 2017-2020 Free Software Foundation, Inc.
       3                 :            : 
       4                 :            : This file is part of GCC.
       5                 :            : 
       6                 :            : GCC is free software; you can redistribute it and/or modify it
       7                 :            : under the terms of the GNU General Public License as published by the
       8                 :            : Free Software Foundation; either version 3, or (at your option) any
       9                 :            : later version.
      10                 :            : 
      11                 :            : GCC is distributed in the hope that it will be useful, but WITHOUT
      12                 :            : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
      13                 :            : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      14                 :            : for more details.
      15                 :            : 
      16                 :            : You should have received a copy of the GNU General Public License
      17                 :            : along with GCC; see the file COPYING3.  If not see
      18                 :            : <http://www.gnu.org/licenses/>.  */
      19                 :            : 
      20                 :            : #include "config.h"
      21                 :            : #include "system.h"
      22                 :            : #include "coretypes.h"
      23                 :            : #include "tree-pass.h"
      24                 :            : #include "backend.h"
      25                 :            : #include "tree.h"
      26                 :            : #include "gimple.h"
      27                 :            : #include "ssa.h"
      28                 :            : #include "fold-const.h"
      29                 :            : #include "tree-cfg.h"
      30                 :            : #include "tree-ssa.h"
      31                 :            : #include "tree-ssa-loop-niter.h"
      32                 :            : #include "tree-ssa-loop.h"
      33                 :            : #include "tree-ssa-loop-manip.h"
      34                 :            : #include "cfgloop.h"
      35                 :            : #include "tree-scalar-evolution.h"
      36                 :            : #include "gimple-iterator.h"
      37                 :            : #include "cfghooks.h"
      38                 :            : #include "tree-data-ref.h"
      39                 :            : #include "tree-ssa-loop-ivopts.h"
      40                 :            : #include "tree-vectorizer.h"
      41                 :            : 
      42                 :            : /* Unroll and Jam transformation
      43                 :            :    
      44                 :            :    This is a combination of two transformations, where the second
      45                 :            :    is not always valid.  It's applicable if a loop nest has redundancies
      46                 :            :    over the iterations of an outer loop while not having that with
      47                 :            :    an inner loop.
      48                 :            : 
      49                 :            :    Given this nest:
      50                 :            :        for (i) {
      51                 :            :          for (j) {
      52                 :            :            B(i,j)
      53                 :            :          }
      54                 :            :        }
      55                 :            : 
      56                 :            :    first unroll:
      57                 :            :        for (i by 2) {
      58                 :            :          for (j) {
      59                 :            :            B(i,j)
      60                 :            :          }
      61                 :            :          for (j) {
      62                 :            :            B(i+1,j)
      63                 :            :          }
      64                 :            :        }
      65                 :            : 
      66                 :            :    then fuse the two adjacent inner loops resulting from that:
      67                 :            :        for (i by 2) {
      68                 :            :          for (j) {
      69                 :            :            B(i,j)
      70                 :            :            B(i+1,j)
      71                 :            :          }
      72                 :            :        }
      73                 :            : 
      74                 :            :    As the order of evaluations of the body B changes this is valid
      75                 :            :    only in certain situations: all distance vectors need to be forward.
      76                 :            :    Additionally if there are multiple induction variables than just
      77                 :            :    a counting control IV (j above) we can also deal with some situations.
      78                 :            : 
      79                 :            :    The validity is checked by unroll_jam_possible_p, and the data-dep
      80                 :            :    testing below.
      81                 :            : 
      82                 :            :    A trivial example where the fusion is wrong would be when
      83                 :            :    B(i,j) == x[j-1] = x[j];
      84                 :            :        for (i by 2) {
      85                 :            :          for (j) {
      86                 :            :            x[j-1] = x[j];
      87                 :            :          }
      88                 :            :          for (j) {
      89                 :            :            x[j-1] = x[j];
      90                 :            :          }
      91                 :            :        }  effect: move content to front by two elements
      92                 :            :        -->
      93                 :            :        for (i by 2) {
      94                 :            :          for (j) {
      95                 :            :            x[j-1] = x[j];
      96                 :            :            x[j-1] = x[j];
      97                 :            :          }
      98                 :            :        }  effect: move content to front by one element
      99                 :            : */
     100                 :            : 
     101                 :            : /* Modify the loop tree for the fact that all code once belonging
     102                 :            :    to the OLD loop or the outer loop of OLD now is inside LOOP.  */
     103                 :            : 
     104                 :            : static void
     105                 :         65 : merge_loop_tree (class loop *loop, class loop *old)
     106                 :            : {
     107                 :         65 :   basic_block *bbs;
     108                 :         65 :   int i, n;
     109                 :         65 :   class loop *subloop;
     110                 :         65 :   edge e;
     111                 :         65 :   edge_iterator ei;
     112                 :            : 
     113                 :            :   /* Find its nodes.  */
     114                 :         65 :   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
     115                 :         65 :   n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
     116                 :            : 
     117                 :        568 :   for (i = 0; i < n; i++)
     118                 :            :     {
     119                 :            :       /* If the block was direct child of OLD loop it's now part
     120                 :            :          of LOOP.  If it was outside OLD, then it moved into LOOP
     121                 :            :          as well.  This avoids changing the loop father for BBs
     122                 :            :          in inner loops of OLD.  */
     123                 :        941 :       if (bbs[i]->loop_father == old
     124                 :       1249 :           || loop_depth (bbs[i]->loop_father) < loop_depth (old))
     125                 :            :         {
     126                 :        438 :           remove_bb_from_loops (bbs[i]);
     127                 :        438 :           add_bb_to_loop (bbs[i], loop);
     128                 :        438 :           continue;
     129                 :            :         }
     130                 :            : 
     131                 :            :       /* If we find a direct subloop of OLD, move it to LOOP.  */
     132                 :         65 :       subloop = bbs[i]->loop_father;
     133                 :        130 :       if (loop_outer (subloop) == old && subloop->header == bbs[i])
     134                 :            :         {
     135                 :          0 :           flow_loop_tree_node_remove (subloop);
     136                 :          0 :           flow_loop_tree_node_add (loop, subloop);
     137                 :            :         }
     138                 :            :     }
     139                 :            : 
     140                 :            :   /* Update the information about loop exit edges.  */
     141                 :        568 :   for (i = 0; i < n; i++)
     142                 :            :     {
     143                 :       1087 :       FOR_EACH_EDGE (e, ei, bbs[i]->succs)
     144                 :            :         {
     145                 :        584 :           rescan_loop_exit (e, false, false);
     146                 :            :         }
     147                 :            :     }
     148                 :            : 
     149                 :         65 :   loop->num_nodes = n;
     150                 :            : 
     151                 :         65 :   free (bbs);
     152                 :         65 : }
     153                 :            : 
     154                 :            : /* BB is part of the outer loop of an unroll-and-jam situation.
     155                 :            :    Check if any statements therein would prevent the transformation.  */
     156                 :            : 
     157                 :            : static bool
     158                 :       4041 : bb_prevents_fusion_p (basic_block bb)
     159                 :            : {
     160                 :       4041 :   gimple_stmt_iterator gsi;
     161                 :            :   /* BB is duplicated by outer unrolling and then all N-1 first copies
     162                 :            :      move into the body of the fused inner loop.  If BB exits the outer loop
     163                 :            :      the last copy still does so, and the first N-1 copies are cancelled
     164                 :            :      by loop unrolling, so also after fusion it's the exit block.
     165                 :            :      But there might be other reasons that prevent fusion:
     166                 :            :        * stores or unknown side-effects prevent fusion
     167                 :            :        * loads don't
     168                 :            :        * computations into SSA names: these aren't problematic.  Their
     169                 :            :          result will be unused on the exit edges of the first N-1 copies
     170                 :            :          (those aren't taken after unrolling).  If they are used on the
     171                 :            :          other edge (the one leading to the outer latch block) they are
     172                 :            :          loop-carried (on the outer loop) and the Nth copy of BB will
     173                 :            :          compute them again (i.e. the first N-1 copies will be dead).  */
     174                 :      15807 :   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
     175                 :            :     {
     176                 :       7864 :       gimple *g = gsi_stmt (gsi);
     177                 :      12822 :       if (gimple_vdef (g) || gimple_has_side_effects (g))
     178                 :        139 :         return true;
     179                 :            :     }
     180                 :            :   return false;
     181                 :            : }
     182                 :            : 
     183                 :            : /* Given an inner loop LOOP (of some OUTER loop) determine if
     184                 :            :    we can safely fuse copies of it (generated by outer unrolling).
     185                 :            :    If so return true, otherwise return false.  */
     186                 :            : 
     187                 :            : static bool
     188                 :       9347 : unroll_jam_possible_p (class loop *outer, class loop *loop)
     189                 :            : {
     190                 :       9347 :   basic_block *bbs;
     191                 :       9347 :   int i, n;
     192                 :       9347 :   class tree_niter_desc niter;
     193                 :            : 
     194                 :            :   /* When fusing the loops we skip the latch block
     195                 :            :      of the first one, so it mustn't have any effects to
     196                 :            :      preserve.  */
     197                 :       9347 :   if (!empty_block_p (loop->latch))
     198                 :            :     return false;
     199                 :            : 
     200                 :       8905 :   if (!single_exit (loop))
     201                 :            :     return false;
     202                 :            : 
     203                 :            :   /* We need a perfect nest.  Quick check for adjacent inner loops.  */
     204                 :       6340 :   if (outer->inner != loop || loop->next)
     205                 :            :     return false;
     206                 :            : 
     207                 :            :   /* Prevent head-controlled inner loops, that we usually have.
     208                 :            :      The guard block would need to be accepted
     209                 :            :      (invariant condition either entering or skipping the loop),
     210                 :            :      without also accepting arbitrary control flow.  When unswitching
     211                 :            :      ran before us (as with -O3) this won't be a problem because its
     212                 :            :      outer loop unswitching will have moved out the invariant condition.
     213                 :            : 
     214                 :            :      If we do that we need to extend fuse_loops() to cope with this
     215                 :            :      by threading through the (still invariant) copied condition
     216                 :            :      between the two loop copies.  */
     217                 :       2968 :   if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
     218                 :            :     return false;
     219                 :            : 
     220                 :            :   /* The number of iterations of the inner loop must be loop invariant
     221                 :            :      with respect to the outer loop.  */
     222                 :       1957 :   if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
     223                 :            :                                  false, true)
     224                 :       1682 :       || niter.cmp == ERROR_MARK
     225                 :       1682 :       || !integer_zerop (niter.may_be_zero)
     226                 :       3484 :       || !expr_invariant_in_loop_p (outer, niter.niter))
     227                 :        453 :     return false;
     228                 :            : 
     229                 :            :   /* If the inner loop produces any values that are used inside the
     230                 :            :      outer loop (except the virtual op) then it can flow
     231                 :            :      back (perhaps indirectly) into the inner loop.  This prevents
     232                 :            :      fusion: without fusion the value at the last iteration is used,
     233                 :            :      with fusion the value after the initial iteration is used.
     234                 :            : 
     235                 :            :      If all uses are outside the outer loop this doesn't prevent fusion;
     236                 :            :      the value of the last iteration is still used (and the values from
     237                 :            :      all intermediate iterations are dead).  */
     238                 :       1504 :   gphi_iterator psi;
     239                 :       1504 :   for (psi = gsi_start_phis (single_exit (loop)->dest);
     240                 :       2000 :        !gsi_end_p (psi); gsi_next (&psi))
     241                 :            :     {
     242                 :        658 :       imm_use_iterator imm_iter;
     243                 :        658 :       use_operand_p use_p;
     244                 :        658 :       tree op = gimple_phi_result (psi.phi ());
     245                 :       1316 :       if (virtual_operand_p (op))
     246                 :        480 :         continue;
     247                 :        195 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
     248                 :            :         {
     249                 :        179 :           gimple *use_stmt = USE_STMT (use_p);
     250                 :        179 :           if (!is_gimple_debug (use_stmt)
     251                 :        179 :               && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
     252                 :        162 :             return false;
     253                 :            :         }
     254                 :            :     }
     255                 :            : 
     256                 :            :   /* And check blocks belonging to just outer loop.  */
     257                 :       1342 :   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
     258                 :       1342 :   n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
     259                 :            : 
     260                 :       7846 :   for (i = 0; i < n; i++)
     261                 :       6643 :     if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
     262                 :            :       break;
     263                 :       1342 :   free (bbs);
     264                 :       1342 :   if (i != n)
     265                 :            :     return false;
     266                 :            : 
     267                 :            :   /* For now we can safely fuse copies of LOOP only if all
     268                 :            :      loop carried variables are inductions (or the virtual op).
     269                 :            : 
     270                 :            :      We could handle reductions as well (the initial value in the second
     271                 :            :      body would be the after-iter value of the first body) if it's over
     272                 :            :      an associative and commutative operation.  We wouldn't
     273                 :            :      be able to handle unknown cycles.  */
     274                 :       3485 :   for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
     275                 :            :     {
     276                 :       2352 :       affine_iv iv;
     277                 :       2352 :       tree op = gimple_phi_result (psi.phi ());
     278                 :            : 
     279                 :       4704 :       if (virtual_operand_p (op))
     280                 :       1099 :         continue;
     281                 :       1253 :       if (!simple_iv (loop, loop, op, &iv, true))
     282                 :         70 :         return false;
     283                 :            :       /* The inductions must be regular, loop invariant step and initial
     284                 :            :          value.  */
     285                 :       1248 :       if (!expr_invariant_in_loop_p (outer, iv.step)
     286                 :       1248 :           || !expr_invariant_in_loop_p (outer, iv.base))
     287                 :         65 :         return false;
     288                 :            :       /* XXX With more effort we could also be able to deal with inductions
     289                 :            :          where the initial value is loop variant but a simple IV in the
     290                 :            :          outer loop.  The initial value for the second body would be
     291                 :            :          the original initial value plus iv.base.step.  The next value
     292                 :            :          for the fused loop would be the original next value of the first
     293                 :            :          copy, _not_ the next value of the second body.  */
     294                 :            :     }
     295                 :            : 
     296                 :            :   return true;
     297                 :            : }
     298                 :            : 
     299                 :            : /* Fuse LOOP with all further neighbors.  The loops are expected to
     300                 :            :    be in appropriate form.  */
     301                 :            : 
     302                 :            : static void
     303                 :         65 : fuse_loops (class loop *loop)
     304                 :            : {
     305                 :         65 :   class loop *next = loop->next;
     306                 :            : 
     307                 :        130 :   while (next)
     308                 :            :     {
     309                 :         65 :       edge e;
     310                 :            : 
     311                 :         65 :       remove_branch (single_pred_edge (loop->latch));
     312                 :            :       /* Make delete_basic_block not fiddle with the loop structure.  */
     313                 :         65 :       basic_block oldlatch = loop->latch;
     314                 :         65 :       loop->latch = NULL;
     315                 :         65 :       delete_basic_block (oldlatch);
     316                 :         65 :       e = redirect_edge_and_branch (loop_latch_edge (next),
     317                 :            :                                     loop->header);
     318                 :         65 :       loop->latch = e->src;
     319                 :         65 :       flush_pending_stmts (e);
     320                 :            : 
     321                 :         65 :       gcc_assert (EDGE_COUNT (next->header->preds) == 1);
     322                 :            : 
     323                 :            :       /* The PHI nodes of the second body (single-argument now)
     324                 :            :          need adjustments to use the right values: either directly
     325                 :            :          the value of the corresponding PHI in the first copy or
     326                 :            :          the one leaving the first body which unrolling did for us.
     327                 :            : 
     328                 :            :          See also unroll_jam_possible_p() for further possibilities.  */
     329                 :         65 :       gphi_iterator psi_first, psi_second;
     330                 :         65 :       e = single_pred_edge (next->header);
     331                 :         65 :       for (psi_first = gsi_start_phis (loop->header),
     332                 :         65 :            psi_second = gsi_start_phis (next->header);
     333                 :        179 :            !gsi_end_p (psi_first);
     334                 :        114 :            gsi_next (&psi_first), gsi_next (&psi_second))
     335                 :            :         {
     336                 :        114 :           gphi *phi_first = psi_first.phi ();
     337                 :        114 :           gphi *phi_second = psi_second.phi ();
     338                 :        114 :           tree firstop = gimple_phi_result (phi_first);
     339                 :            :           /* The virtual operand is correct already as it's
     340                 :            :              always live at exit, hence has a LCSSA node and outer
     341                 :            :              loop unrolling updated SSA form.  */
     342                 :        228 :           if (virtual_operand_p (firstop))
     343                 :         49 :             continue;
     344                 :            : 
     345                 :            :           /* Due to unroll_jam_possible_p() we know that this is
     346                 :            :              an induction.  The second body goes over the same
     347                 :            :              iteration space.  */
     348                 :         65 :           add_phi_arg (phi_second, firstop, e,
     349                 :            :                        gimple_location (phi_first));
     350                 :            :         }
     351                 :         65 :       gcc_assert (gsi_end_p (psi_second));
     352                 :            : 
     353                 :         65 :       merge_loop_tree (loop, next);
     354                 :         65 :       gcc_assert (!next->num_nodes);
     355                 :         65 :       class loop *ln = next->next;
     356                 :         65 :       delete_loop (next);
     357                 :         65 :       next = ln;
     358                 :            :     }
     359                 :         65 :   rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
     360                 :         65 : }
     361                 :            : 
     362                 :            : /* Return true if any of the access functions for dataref A
     363                 :            :    isn't invariant with respect to loop LOOP_NEST.  */
     364                 :            : static bool
     365                 :       3746 : any_access_function_variant_p (const struct data_reference *a,
     366                 :            :                                const class loop *loop_nest)
     367                 :            : {
     368                 :       3746 :   unsigned int i;
     369                 :       3746 :   vec<tree> fns = DR_ACCESS_FNS (a);
     370                 :       3746 :   tree t;
     371                 :            : 
     372                 :       4703 :   FOR_EACH_VEC_ELT (fns, i, t)
     373                 :       4699 :     if (!evolution_function_is_invariant_p (t, loop_nest->num))
     374                 :            :       return true;
     375                 :            : 
     376                 :            :   return false;
     377                 :            : }
     378                 :            : 
     379                 :            : /* Returns true if the distance in DDR can be determined and adjusts
     380                 :            :    the unroll factor in *UNROLL to make unrolling valid for that distance.
     381                 :            :    Otherwise return false.  DDR is with respect to the outer loop of INNER.
     382                 :            : 
     383                 :            :    If this data dep can lead to a removed memory reference, increment
     384                 :            :    *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
     385                 :            :    for this to happen.  */
     386                 :            : 
     387                 :            : static bool
     388                 :       2418 : adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
     389                 :            :                       unsigned *unroll, unsigned *profit_unroll,
     390                 :            :                       unsigned *removed)
     391                 :            : {
     392                 :       2418 :   bool ret = false;
     393                 :       2418 :   if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
     394                 :            :     {
     395                 :       2418 :       if (DDR_NUM_DIST_VECTS (ddr) == 0)
     396                 :       2418 :         return false;
     397                 :            :       unsigned i;
     398                 :            :       lambda_vector dist_v;
     399                 :       3739 :       FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
     400                 :            :         {
     401                 :            :           /* A distance (a,b) is at worst transformed into (a/N,b) by the
     402                 :            :              unrolling (factor N), so the transformation is valid if
     403                 :            :              a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
     404                 :            :              factor needs to be limited so that the first condition holds.
     405                 :            :              That may limit the factor down to zero in the worst case.  */
     406                 :       2442 :           int dist = dist_v[0];
     407                 :       2442 :           if (dist < 0)
     408                 :          0 :             gcc_unreachable ();
     409                 :       2442 :           else if ((unsigned)dist >= *unroll)
     410                 :            :             ;
     411                 :       7194 :           else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
     412                 :            :             {
     413                 :            :               /* We have (a,0) with a < N, so this will be transformed into
     414                 :            :                  (0,0) after unrolling by N.  This might potentially be a
     415                 :            :                  problem, if it's not a read-read dependency.  */
     416                 :       2339 :               if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
     417                 :            :                 ;
     418                 :            :               else
     419                 :            :                 {
     420                 :            :                   /* So, at least one is a write, and we might reduce the
     421                 :            :                      distance vector to (0,0).  This is still no problem
     422                 :            :                      if both data-refs are affine with respect to the inner
     423                 :            :                      loops.  But if one of them is invariant with respect
     424                 :            :                      to an inner loop our reordering implicit in loop fusion
     425                 :            :                      corrupts the program, as our data dependences don't
     426                 :            :                      capture this.  E.g. for:
     427                 :            :                        for (0 <= i < n)
     428                 :            :                          for (0 <= j < m)
     429                 :            :                            a[i][0] = a[i+1][0] + 2;    // (1)
     430                 :            :                            b[i][j] = b[i+1][j] + 2;    // (2)
     431                 :            :                      the distance vector for both statements is (-1,0),
     432                 :            :                      but exchanging the order for (2) is okay, while
     433                 :            :                      for (1) it is not.  To see this, write out the original
     434                 :            :                      accesses (assume m is 2):
     435                 :            :                        a i j original
     436                 :            :                        0 0 0 r a[1][0] b[1][0]
     437                 :            :                        1 0 0 w a[0][0] b[0][0]
     438                 :            :                        2 0 1 r a[1][0] b[1][1]
     439                 :            :                        3 0 1 w a[0][0] b[0][1]
     440                 :            :                        4 1 0 r a[2][0] b[2][0]
     441                 :            :                        5 1 0 w a[1][0] b[1][0]
     442                 :            :                      after unroll-by-2 and fusion the accesses are done in
     443                 :            :                      this order (from column a): 0,1, 4,5, 2,3, i.e. this:
     444                 :            :                        a i j transformed
     445                 :            :                        0 0 0 r a[1][0] b[1][0]
     446                 :            :                        1 0 0 w a[0][0] b[0][0]
     447                 :            :                        4 1 0 r a[2][0] b[2][0]
     448                 :            :                        5 1 0 w a[1][0] b[1][0]
     449                 :            :                        2 0 1 r a[1][0] b[1][1]  
     450                 :            :                        3 0 1 w a[0][0] b[0][1]
     451                 :            :                      Note how access 2 accesses the same element as access 5
     452                 :            :                      for array 'a' but not for array 'b'.  */
     453                 :       1875 :                   if (any_access_function_variant_p (DDR_A (ddr), inner)
     454                 :       1875 :                       && any_access_function_variant_p (DDR_B (ddr), inner))
     455                 :            :                     ;
     456                 :            :                   else
     457                 :            :                     /* And if any dataref of this pair is invariant with
     458                 :            :                        respect to the inner loop, we have no chance than
     459                 :            :                        to reduce the unroll factor.  */
     460                 :          4 :                     *unroll = dist;
     461                 :            :                 }
     462                 :            :             }
     463                 :         59 :           else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
     464                 :            :             ;
     465                 :            :           else
     466                 :         37 :             *unroll = dist;
     467                 :            : 
     468                 :            :           /* With a distance (a,0) it's always profitable to unroll-and-jam
     469                 :            :              (by a+1), because one memory reference will go away.  With
     470                 :            :              (a,b) and b != 0 that's less clear.  We will increase the
     471                 :            :              number of streams without lowering the number of mem refs.
     472                 :            :              So for now only handle the first situation.  */
     473                 :       7326 :           if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
     474                 :            :             {
     475                 :       2341 :               *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
     476                 :       2341 :               (*removed)++;
     477                 :            :             }
     478                 :            : 
     479                 :       2442 :           ret = true;
     480                 :            :         }
     481                 :            :     }
     482                 :            :   return ret;
     483                 :            : }
     484                 :            : 
     485                 :            : /* Main entry point for the unroll-and-jam transformation
     486                 :            :    described above.  */
     487                 :            : 
     488                 :            : static unsigned int
     489                 :      17678 : tree_loop_unroll_and_jam (void)
     490                 :            : {
     491                 :      17678 :   class loop *loop;
     492                 :      17678 :   bool changed = false;
     493                 :            : 
     494                 :      17678 :   gcc_assert (scev_initialized_p ());
     495                 :            : 
     496                 :            :   /* Go through all innermost loops.  */
     497                 :      57905 :   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
     498                 :            :     {
     499                 :      40227 :       class loop *outer = loop_outer (loop);
     500                 :            : 
     501                 :      40227 :       if (loop_depth (loop) < 2
     502                 :      40227 :           || optimize_loop_nest_for_size_p (outer))
     503                 :      30880 :         continue;
     504                 :            : 
     505                 :       9347 :       if (!unroll_jam_possible_p (outer, loop))
     506                 :       8214 :         continue;
     507                 :            : 
     508                 :       1133 :       vec<data_reference_p> datarefs;
     509                 :       1133 :       vec<ddr_p> dependences;
     510                 :       1133 :       unsigned unroll_factor, profit_unroll, removed;
     511                 :       1133 :       class tree_niter_desc desc;
     512                 :       1133 :       bool unroll = false;
     513                 :            : 
     514                 :       1929 :       auto_vec<loop_p, 3> loop_nest;
     515                 :       1133 :       dependences.create (10);
     516                 :       1133 :       datarefs.create (10);
     517                 :       1133 :       if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
     518                 :            :                                               &datarefs, &dependences))
     519                 :            :         {
     520                 :        277 :           if (dump_file && (dump_flags & TDF_DETAILS))
     521                 :          0 :             fprintf (dump_file, "Cannot analyze data dependencies\n");
     522                 :        277 :           free_data_refs (datarefs);
     523                 :        277 :           free_dependence_relations (dependences);
     524                 :      39708 :           continue;
     525                 :            :         }
     526                 :        856 :       if (!datarefs.length ())
     527                 :         60 :         continue;
     528                 :            : 
     529                 :        796 :       if (dump_file && (dump_flags & TDF_DETAILS))
     530                 :         14 :         dump_data_dependence_relations (dump_file, dependences);
     531                 :            : 
     532                 :        796 :       unroll_factor = (unsigned)-1;
     533                 :        796 :       profit_unroll = 1;
     534                 :        796 :       removed = 0;
     535                 :            : 
     536                 :            :       /* Check all dependencies.  */
     537                 :        796 :       unsigned i;
     538                 :        796 :       struct data_dependence_relation *ddr;
     539                 :      13394 :       FOR_EACH_VEC_ELT (dependences, i, ddr)
     540                 :            :         {
     541                 :      12727 :           struct data_reference *dra, *drb;
     542                 :            : 
     543                 :            :           /* If the refs are independend there's nothing to do.  */
     544                 :      12727 :           if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
     545                 :       8547 :             continue;
     546                 :       4180 :           dra = DDR_A (ddr);
     547                 :       4180 :           drb = DDR_B (ddr);
     548                 :            :           /* Nothing interesting for the self dependencies.  */
     549                 :       4180 :           if (dra == drb)
     550                 :       1762 :             continue;
     551                 :            : 
     552                 :            :           /* Now check the distance vector, for determining a sensible
     553                 :            :              outer unroll factor, and for validity of merging the inner
     554                 :            :              loop copies.  */
     555                 :       2418 :           if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
     556                 :            :                                      &removed))
     557                 :            :             {
     558                 :            :               /* Couldn't get the distance vector.  For two reads that's
     559                 :            :                  harmless (we assume we should unroll).  For at least
     560                 :            :                  one write this means we can't check the dependence direction
     561                 :            :                  and hence can't determine safety.  */
     562                 :            : 
     563                 :       1121 :               if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
     564                 :            :                 {
     565                 :        129 :                   unroll_factor = 0;
     566                 :        129 :                   break;
     567                 :            :                 }
     568                 :            :             }
     569                 :            :         }
     570                 :            : 
     571                 :            :       /* We regard a user-specified minimum percentage of zero as a request
     572                 :            :          to ignore all profitability concerns and apply the transformation
     573                 :            :          always.  */
     574                 :        796 :       if (!param_unroll_jam_min_percent)
     575                 :         18 :         profit_unroll = MAX(2, profit_unroll);
     576                 :        778 :       else if (removed * 100 / datarefs.length ()
     577                 :        778 :           < (unsigned)param_unroll_jam_min_percent)
     578                 :        568 :         profit_unroll = 1;
     579                 :        796 :       if (unroll_factor > profit_unroll)
     580                 :        626 :         unroll_factor = profit_unroll;
     581                 :        796 :       if (unroll_factor > (unsigned)param_unroll_jam_max_unroll)
     582                 :          0 :         unroll_factor = param_unroll_jam_max_unroll;
     583                 :       1657 :       unroll = (unroll_factor > 1
     584                 :        796 :                 && can_unroll_loop_p (outer, unroll_factor, &desc));
     585                 :            : 
     586                 :         65 :       if (unroll)
     587                 :            :         {
     588                 :         65 :           if (dump_enabled_p ())
     589                 :          6 :             dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
     590                 :          6 :                              find_loop_location (outer),
     591                 :            :                              "applying unroll and jam with factor %d\n",
     592                 :            :                              unroll_factor);
     593                 :         65 :           initialize_original_copy_tables ();
     594                 :         65 :           tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
     595                 :            :                             &desc);
     596                 :         65 :           free_original_copy_tables ();
     597                 :         65 :           fuse_loops (outer->inner);
     598                 :         65 :           changed = true;
     599                 :            :         }
     600                 :            : 
     601                 :        796 :       loop_nest.release ();
     602                 :        796 :       free_dependence_relations (dependences);
     603                 :        796 :       free_data_refs (datarefs);
     604                 :            :     }
     605                 :            : 
     606                 :      17678 :   if (changed)
     607                 :            :     {
     608                 :         58 :       scev_reset ();
     609                 :         58 :       free_dominance_info (CDI_DOMINATORS);
     610                 :         58 :       return TODO_cleanup_cfg;
     611                 :            :     }
     612                 :            :   return 0;
     613                 :            : }
     614                 :            : 
     615                 :            : /* Pass boilerplate */
     616                 :            : 
     617                 :            : namespace {
     618                 :            : 
     619                 :            : const pass_data pass_data_loop_jam =
     620                 :            : {
     621                 :            :   GIMPLE_PASS, /* type */
     622                 :            :   "unrolljam", /* name */
     623                 :            :   OPTGROUP_LOOP, /* optinfo_flags */
     624                 :            :   TV_LOOP_JAM, /* tv_id */
     625                 :            :   PROP_cfg, /* properties_required */
     626                 :            :   0, /* properties_provided */
     627                 :            :   0, /* properties_destroyed */
     628                 :            :   0, /* todo_flags_start */
     629                 :            :   0, /* todo_flags_finish */
     630                 :            : };
     631                 :            : 
     632                 :            : class pass_loop_jam : public gimple_opt_pass
     633                 :            : {
     634                 :            : public:
     635                 :     200773 :   pass_loop_jam (gcc::context *ctxt)
     636                 :     401546 :     : gimple_opt_pass (pass_data_loop_jam, ctxt)
     637                 :            :   {}
     638                 :            : 
     639                 :            :   /* opt_pass methods: */
     640                 :     157676 :   virtual bool gate (function *) { return flag_unroll_jam != 0; }
     641                 :            :   virtual unsigned int execute (function *);
     642                 :            : 
     643                 :            : };
     644                 :            : 
     645                 :            : unsigned int
     646                 :      17678 : pass_loop_jam::execute (function *fun)
     647                 :            : {
     648                 :      35356 :   if (number_of_loops (fun) <= 1)
     649                 :            :     return 0;
     650                 :            : 
     651                 :      17678 :   return tree_loop_unroll_and_jam ();
     652                 :            : }
     653                 :            : 
     654                 :            : }
     655                 :            : 
     656                 :            : gimple_opt_pass *
     657                 :     200773 : make_pass_loop_jam (gcc::context *ctxt)
     658                 :            : {
     659                 :     200773 :   return new pass_loop_jam (ctxt);
     660                 :            : }

Generated by: LCOV version 1.0

LCOV profile is generated on x86_64 machine using following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto --enable-host-shared. GCC test suite is run with the built compiler.