Skip to content

Commit 4786f8b

Browse files
committed
8369448: C2 SuperWord: refactor VTransform to do move_unordered_reduction_out_of_loop during VTransform::optimize
Reviewed-by: chagedorn, kvn
1 parent a3ee821 commit 4786f8b

File tree

10 files changed

+402
-341
lines changed

10 files changed

+402
-341
lines changed

src/hotspot/share/opto/loopnode.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5287,16 +5287,6 @@ void PhaseIdealLoop::build_and_optimize() {
52875287
}
52885288
}
52895289

5290-
// Move UnorderedReduction out of counted loop. Can be introduced by AutoVectorization.
5291-
if (C->has_loops() && !C->major_progress()) {
5292-
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
5293-
IdealLoopTree* lpt = iter.current();
5294-
if (lpt->is_counted() && lpt->is_innermost()) {
5295-
move_unordered_reduction_out_of_loop(lpt);
5296-
}
5297-
}
5298-
}
5299-
53005290
// Keep loop predicates and perform optimizations with them
53015291
// until no more loop optimizations could be done.
53025292
// After that switch predicates off and do more loop optimizations.

src/hotspot/share/opto/loopnode.hpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1550,9 +1550,6 @@ class PhaseIdealLoop : public PhaseTransform {
15501550
IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj);
15511551
bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt);
15521552

1553-
// Move an unordered Reduction out of loop if possible
1554-
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
1555-
15561553
// Create a scheduled list of nodes control dependent on ctrl set.
15571554
void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched );
15581555
// Has a use in the vector set

src/hotspot/share/opto/loopopts.cpp

Lines changed: 0 additions & 205 deletions
Original file line numberDiff line numberDiff line change
@@ -4548,211 +4548,6 @@ void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(Id
45484548
do_multiversioning(lpt, old_new);
45494549
}
45504550

4551-
// Returns true if the Reduction node is unordered.
4552-
static bool is_unordered_reduction(Node* n) {
4553-
return n->is_Reduction() && !n->as_Reduction()->requires_strict_order();
4554-
}
4555-
4556-
// Having ReductionNodes in the loop is expensive. They need to recursively
4557-
// fold together the vector values, for every vectorized loop iteration. If
4558-
// we encounter the following pattern, we can vector accumulate the values
4559-
// inside the loop, and only have a single UnorderedReduction after the loop.
4560-
//
4561-
// Note: UnorderedReduction represents a ReductionNode which does not require
4562-
// calculating in strict order.
4563-
//
4564-
// CountedLoop init
4565-
// | |
4566-
// +------+ | +-----------------------+
4567-
// | | | |
4568-
// PhiNode (s) |
4569-
// | |
4570-
// | Vector |
4571-
// | | |
4572-
// UnorderedReduction (first_ur) |
4573-
// | |
4574-
// ... Vector |
4575-
// | | |
4576-
// UnorderedReduction (last_ur) |
4577-
// | |
4578-
// +---------------------+
4579-
//
4580-
// We patch the graph to look like this:
4581-
//
4582-
// CountedLoop identity_vector
4583-
// | |
4584-
// +-------+ | +---------------+
4585-
// | | | |
4586-
// PhiNode (v) |
4587-
// | |
4588-
// | Vector |
4589-
// | | |
4590-
// VectorAccumulator |
4591-
// | |
4592-
// ... Vector |
4593-
// | | |
4594-
// init VectorAccumulator |
4595-
// | | | |
4596-
// UnorderedReduction +-----------+
4597-
//
4598-
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
4599-
// use vector_accumulators, which do the same reductions, but only element
4600-
// wise. This is a single operation per vector_accumulator, rather than many
4601-
// for a UnorderedReduction. We can then reduce the last vector_accumulator
4602-
// after the loop, and also reduce the init value into it.
4603-
//
4604-
// We can not do this with all reductions. Some reductions do not allow the
4605-
// reordering of operations (for example float addition/multiplication require
4606-
// strict order).
4607-
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
4608-
assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");
4609-
4610-
// Find all Phi nodes with an unordered Reduction on backedge.
4611-
CountedLoopNode* cl = loop->_head->as_CountedLoop();
4612-
for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
4613-
Node* phi = cl->fast_out(j);
4614-
// We have a phi with a single use, and an unordered Reduction on the backedge.
4615-
if (!phi->is_Phi() || phi->outcnt() != 1 || !is_unordered_reduction(phi->in(2))) {
4616-
continue;
4617-
}
4618-
4619-
ReductionNode* last_ur = phi->in(2)->as_Reduction();
4620-
assert(!last_ur->requires_strict_order(), "must be");
4621-
4622-
// Determine types
4623-
const TypeVect* vec_t = last_ur->vect_type();
4624-
uint vector_length = vec_t->length();
4625-
BasicType bt = vec_t->element_basic_type();
4626-
4627-
// Convert opcode from vector-reduction -> scalar -> normal-vector-op
4628-
const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt);
4629-
const int vopc = VectorNode::opcode(sopc, bt);
4630-
if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) {
4631-
DEBUG_ONLY( last_ur->dump(); )
4632-
assert(false, "do not have normal vector op for this reduction");
4633-
continue; // not implemented -> fails
4634-
}
4635-
4636-
// Traverse up the chain of unordered Reductions, checking that it loops back to
4637-
// the phi. Check that all unordered Reductions only have a single use, except for
4638-
// the last (last_ur), which only has phi as a use in the loop, and all other uses
4639-
// are outside the loop.
4640-
ReductionNode* current = last_ur;
4641-
ReductionNode* first_ur = nullptr;
4642-
while (true) {
4643-
assert(!current->requires_strict_order(), "sanity");
4644-
4645-
// Expect no ctrl and a vector_input from within the loop.
4646-
Node* ctrl = current->in(0);
4647-
Node* vector_input = current->in(2);
4648-
if (ctrl != nullptr || get_ctrl(vector_input) != cl) {
4649-
DEBUG_ONLY( current->dump(1); )
4650-
assert(false, "reduction has ctrl or bad vector_input");
4651-
break; // Chain traversal fails.
4652-
}
4653-
4654-
assert(current->vect_type() != nullptr, "must have vector type");
4655-
if (current->vect_type() != last_ur->vect_type()) {
4656-
// Reductions do not have the same vector type (length and element type).
4657-
break; // Chain traversal fails.
4658-
}
4659-
4660-
// Expect single use of an unordered Reduction, except for last_ur.
4661-
if (current == last_ur) {
4662-
// Expect all uses to be outside the loop, except phi.
4663-
for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
4664-
Node* use = current->fast_out(k);
4665-
if (use != phi && ctrl_or_self(use) == cl) {
4666-
DEBUG_ONLY( current->dump(-1); )
4667-
assert(false, "reduction has use inside loop");
4668-
// Should not be allowed by SuperWord::mark_reductions
4669-
return; // bail out of optimization
4670-
}
4671-
}
4672-
} else {
4673-
if (current->outcnt() != 1) {
4674-
break; // Chain traversal fails.
4675-
}
4676-
}
4677-
4678-
// Expect another unordered Reduction or phi as the scalar input.
4679-
Node* scalar_input = current->in(1);
4680-
if (is_unordered_reduction(scalar_input) &&
4681-
scalar_input->Opcode() == current->Opcode()) {
4682-
// Move up the unordered Reduction chain.
4683-
current = scalar_input->as_Reduction();
4684-
assert(!current->requires_strict_order(), "must be");
4685-
} else if (scalar_input == phi) {
4686-
// Chain terminates at phi.
4687-
first_ur = current;
4688-
current = nullptr;
4689-
break; // Success.
4690-
} else {
4691-
// scalar_input is neither phi nor a matching reduction
4692-
// Can for example be scalar reduction when we have
4693-
// partial vectorization.
4694-
break; // Chain traversal fails.
4695-
}
4696-
}
4697-
if (current != nullptr) {
4698-
// Chain traversal was not successful.
4699-
continue;
4700-
}
4701-
assert(first_ur != nullptr, "must have successfully terminated chain traversal");
4702-
4703-
Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
4704-
set_root_as_ctrl(identity_scalar);
4705-
VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt);
4706-
register_new_node(identity_vector, C->root());
4707-
assert(vec_t == identity_vector->vect_type(), "matching vector type");
4708-
VectorNode::trace_new_vector(identity_vector, "Unordered Reduction");
4709-
4710-
// Turn the scalar phi into a vector phi.
4711-
_igvn.rehash_node_delayed(phi);
4712-
Node* init = phi->in(1); // Remember init before replacing it.
4713-
phi->set_req_X(1, identity_vector, &_igvn);
4714-
phi->as_Type()->set_type(vec_t);
4715-
_igvn.set_type(phi, vec_t);
4716-
4717-
// Traverse down the chain of unordered Reductions, and replace them with vector_accumulators.
4718-
current = first_ur;
4719-
while (true) {
4720-
// Create vector_accumulator to replace current.
4721-
Node* last_vector_accumulator = current->in(1);
4722-
Node* vector_input = current->in(2);
4723-
VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
4724-
register_new_node(vector_accumulator, cl);
4725-
_igvn.replace_node(current, vector_accumulator);
4726-
VectorNode::trace_new_vector(vector_accumulator, "Unordered Reduction");
4727-
if (current == last_ur) {
4728-
break;
4729-
}
4730-
current = vector_accumulator->unique_out()->as_Reduction();
4731-
assert(!current->requires_strict_order(), "must be");
4732-
}
4733-
4734-
// Create post-loop reduction.
4735-
Node* last_accumulator = phi->in(2);
4736-
Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt);
4737-
4738-
// Take over uses of last_accumulator that are not in the loop.
4739-
for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) {
4740-
Node* use = last_accumulator->out(i);
4741-
if (use != phi && use != post_loop_reduction) {
4742-
assert(ctrl_or_self(use) != cl, "use must be outside loop");
4743-
use->replace_edge(last_accumulator, post_loop_reduction, &_igvn);
4744-
--i;
4745-
}
4746-
}
4747-
register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
4748-
VectorNode::trace_new_vector(post_loop_reduction, "Unordered Reduction");
4749-
4750-
assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
4751-
assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
4752-
assert(phi->outcnt() == 1, "accumulator is the only use of phi");
4753-
}
4754-
}
4755-
47564551
void DataNodeGraph::clone_data_nodes(Node* new_ctrl) {
47574552
for (uint i = 0; i < _data_nodes.size(); i++) {
47584553
clone(_data_nodes[i], new_ctrl);

src/hotspot/share/opto/superword.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1606,7 +1606,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
16061606
// 3 instructions (1 shuffle and two reduction ops).
16071607
// However, this optimization assumes that these reductions stay in the loop
16081608
// which may not be true any more in most cases after the introduction of:
1609-
// PhaseIdealLoop::move_unordered_reduction_out_of_loop
1609+
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
16101610
// Hence, this heuristic has room for improvement.
16111611
bool is_two_element_int_or_long_reduction = (size == 2) &&
16121612
(arith_type->basic_type() == T_INT ||
@@ -1782,7 +1782,7 @@ bool SuperWord::profitable(const Node_List* p) const {
17821782
// This heuristic is a bit simplistic, and assumes that the reduction
17831783
// vector stays in the loop. But in some cases, we can move the
17841784
// reduction out of the loop, replacing it with a single vector op.
1785-
// See: PhaseIdealLoop::move_unordered_reduction_out_of_loop
1785+
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
17861786
// Hence, this heuristic has room for improvement.
17871787
#ifndef PRODUCT
17881788
if (is_trace_superword_rejections()) {
@@ -1947,6 +1947,8 @@ bool SuperWord::do_vtransform() const {
19471947
SuperWordVTransformBuilder builder(_packset, vtransform);
19481948
}
19491949

1950+
vtransform.optimize();
1951+
19501952
if (!vtransform.schedule()) { return false; }
19511953
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
19521954

src/hotspot/share/opto/traceAutoVectorizationTag.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,11 @@
4545
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
4646
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
4747
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
48+
flags(VTRANSFORM, "Trace VTransform Graph") \
49+
flags(OPTIMIZATION, "Trace VTransform::optimize") \
4850
flags(ALIGN_VECTOR, "Trace AlignVector") \
4951
flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \
5052
flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
51-
flags(VTRANSFORM, "Trace VTransform Graph") \
5253
flags(ALL, "Trace everything (very verbose)")
5354

5455
#define table_entry(name, description) name,

src/hotspot/share/opto/vectorization.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,10 @@ class VLoop : public StackObj {
205205
return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
206206
}
207207

208+
bool is_trace_optimization() const {
209+
return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION);
210+
}
211+
208212
bool is_trace_speculative_runtime_checks() const {
209213
return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
210214
}

0 commit comments

Comments
 (0)