Message ID | 1369339315.21561.42.camel@surprise |
---|---|
State | New |
Headers | show |
> /* The Ith entry is the number of objects on a page or order I. */ > > -static unsigned objects_per_page_table[NUM_ORDERS]; > +DEFINE_STATIC_STATE_ARRAY(unsigned, objects_per_page_table, NUM_ORDERS) > > /* The Ith entry is the size of an object on a page of order I. */ > > -static size_t object_size_table[NUM_ORDERS]; > +DEFINE_STATIC_STATE_ARRAY(size_t, object_size_table, NUM_ORDERS) > > /* The Ith entry is a pair of numbers (mult, shift) such that > ((k * mult) >> shift) mod 2^32 == (k / OBJECT_SIZE(I)) mod 2^32, > for all k evenly divisible by OBJECT_SIZE(I). */ > > -static struct > +struct inverse_table_def > { > size_t mult; > unsigned int shift; > -} > -inverse_table[NUM_ORDERS]; > +}; > +DEFINE_STATIC_STATE_ARRAY(inverse_table_def, inverse_table, NUM_ORDERS) > > /* A page_entry records the status of an allocation page. This > structure is dynamically sized to fit the bitmap in_use_p. */ > @@ -343,7 +346,7 @@ struct free_object > #endif > > /* The rest of the global variables. */ > -static struct globals > +struct globals > { > /* The Nth element in this array is a page with objects of size 2^N. > If there are any pages with free objects, they will be at the > @@ -457,7 +460,11 @@ static struct globals > /* The overhead for each of the allocation orders. */ > unsigned long long total_overhead_per_order[NUM_ORDERS]; > } stats; > -} G; > +}; > + > +DEFINE_STATIC_STATE(globals, G) Be careful here. Note that all of the arrays that you convert are notionally static constants defining the bucket sizes for the algorithm, except we don't actually compute them at compile time. They would be true global data shared by all threads. The G structure contains all of the data that you'd actually want to put into the "universe". That said, I'd really like to avoid ultra-macroization as with > +#define probability_cutoff \ > + universe.tracer_c->x_probability_cutoff > +#define branch_ratio_cutoff \ > + universe.tracer_c->x_branch_ratio_cutoff > +#define bb_seen \ > + universe.tracer_c->x_bb_seen if we can avoid it. I think we need more weigh in from other maintainers on this, rather than iterating a 5th time today... r~
On 05/23/2013 02:31 PM, Richard Henderson wrote: > > I think we need more weigh in from other maintainers on this, rather than > iterating a 5th time today... This seems like an awful lot of pain. I don't think we should be looking to generate different code for library vs executable GCC. I think we should look for *clean* first, then we can look at what we could change if the compile-time performance isn't what we want. Lots of C++ code manages to pass around the implicit this pointer and use it appropriately without that being a significant source of performance concerns. I suspect GCC would be the same in that regard. The cost of passing around & using that pointer is dwarfed by all the other lameness we have. Jeff
On Thu, 2013-05-23 at 13:31 -0700, Richard Henderson wrote: > > /* The Ith entry is the number of objects on a page or order I. */ > > > > -static unsigned objects_per_page_table[NUM_ORDERS]; > > +DEFINE_STATIC_STATE_ARRAY(unsigned, objects_per_page_table, NUM_ORDERS) > > > > /* The Ith entry is the size of an object on a page of order I. */ > > > > -static size_t object_size_table[NUM_ORDERS]; > > +DEFINE_STATIC_STATE_ARRAY(size_t, object_size_table, NUM_ORDERS) > > > > /* The Ith entry is a pair of numbers (mult, shift) such that > > ((k * mult) >> shift) mod 2^32 == (k / OBJECT_SIZE(I)) mod 2^32, > > for all k evenly divisible by OBJECT_SIZE(I). */ > > > > -static struct > > +struct inverse_table_def > > { > > size_t mult; > > unsigned int shift; > > -} > > -inverse_table[NUM_ORDERS]; > > +}; > > +DEFINE_STATIC_STATE_ARRAY(inverse_table_def, inverse_table, NUM_ORDERS) > > > > /* A page_entry records the status of an allocation page. This > > structure is dynamically sized to fit the bitmap in_use_p. */ > > @@ -343,7 +346,7 @@ struct free_object > > #endif > > > > /* The rest of the global variables. */ > > -static struct globals > > +struct globals > > { > > /* The Nth element in this array is a page with objects of size 2^N. > > If there are any pages with free objects, they will be at the > > @@ -457,7 +460,11 @@ static struct globals > > /* The overhead for each of the allocation orders. */ > > unsigned long long total_overhead_per_order[NUM_ORDERS]; > > } stats; > > -} G; > > +}; > > + > > +DEFINE_STATIC_STATE(globals, G) > > Be careful here. Note that all of the arrays that you convert are notionally > static constants defining the bucket sizes for the algorithm, except we don't > actually compute them at compile time. They would be true global data shared > by all threads. I know - but when you have a big hammer, sometimes it's easier just to hit all of the nails, rather than just the ones you need to :) [...]
On Thu, May 23, 2013 at 02:44:48PM -0600, Jeff Law wrote: > On 05/23/2013 02:31 PM, Richard Henderson wrote: > >I think we need more weigh in from other maintainers on this, rather than > >iterating a 5th time today... > This seems like an awful lot of pain. > > I don't think we should be looking to generate different code for > library vs executable GCC. > > I think we should look for *clean* first, then we can look at what > we could change if the compile-time performance isn't what we want. > > Lots of C++ code manages to pass around the implicit this pointer > and use it appropriately without that being a significant source of > performance concerns. I suspect GCC would be the same in that > regard. The cost of passing around & using that pointer is dwarfed > by all the other lameness we have. I'm afraid we don't have the luxury of slowing the compiler too much. Anyway, I don't see how a single this would help with all the global state, because there are various levels of global state. The tracer changes show just the easiest one, non-GTY pass that that is internal to the file, starts living at the start of the pass and can be forgotten at the end of the pass. But, often pass has some of its global state, and calls functions from other files that access different global state (cfun, crtl, current_function_decl, lots of other things), some files have global state preserved across multiple passes, etc. Before we start changing anything, we need a firm plan for everything, otherwise we end up with a useless partial transformation. Some global state data is accessed only occassionally, but other is accessed all the time (cfun being a very good example here). Jakub
On 05/23/2013 02:59 PM, Jakub Jelinek wrote: > On Thu, May 23, 2013 at 02:44:48PM -0600, Jeff Law wrote: >> On 05/23/2013 02:31 PM, Richard Henderson wrote: >>> I think we need more weigh in from other maintainers on this, rather than >>> iterating a 5th time today... >> This seems like an awful lot of pain. >> >> I don't think we should be looking to generate different code for >> library vs executable GCC. >> >> I think we should look for *clean* first, then we can look at what >> we could change if the compile-time performance isn't what we want. >> >> Lots of C++ code manages to pass around the implicit this pointer >> and use it appropriately without that being a significant source of >> performance concerns. I suspect GCC would be the same in that >> regard. The cost of passing around & using that pointer is dwarfed >> by all the other lameness we have. > > I'm afraid we don't have the luxury of slowing the compiler too much. Nor do we have the luxury of continuing to not deal with these long term issues. Nor do we have the luxury of creating something so (*&@#$ ugly that nobody is willing to work on it again in the future. Particularly when there's no evidence it's going to be measurably slower. > > Anyway, I don't see how a single this would help with all the global state, > because there are various levels of global state. The tracer changes show > just the easiest one, non-GTY pass that that is internal to the file, > starts living at the start of the pass and can be forgotten at the end of > the pass. Agreed, but we need to start somewhere and passes of this nature seem like a reasonable place to me But, often pass has some of its global state, and calls functions > from other files that access different global state (cfun, crtl, > current_function_decl, lots of other things), some files have global state > preserved across multiple passes, etc. Before we start changing anything, > we need a firm plan for everything, otherwise we end up with a useless > partial transformation. Some global state data is accessed only > occassionally, but other is accessed all the time (cfun being a very good > example here). I don't disagree in principle WRT partial transformations. But I also believe that there is value in cleaning up the the simple stuff, even in isolation. To take your specific examples: * Global state is global state needs to be available via a global context pointer of some kind regardless of how we handle pass-local data. * pass-local state that spans passes. I'd like to see an example, but my gut feeling is such things are probably going to either become global state or die. They'll have to be handled on a case by case basis. In both cases, David's changes don't make those problems any easier or any harder. He merely encapsulates pass-local data where it's easy to do so right now. It's just a cleaner design/implementation and I'd like to see it pursued. Jeff
On Thu, May 23, 2013 at 11:42 PM, Jeff Law <law@redhat.com> wrote: > On 05/23/2013 02:59 PM, Jakub Jelinek wrote: >> >> On Thu, May 23, 2013 at 02:44:48PM -0600, Jeff Law wrote: >>> >>> On 05/23/2013 02:31 PM, Richard Henderson wrote: >>>> >>>> I think we need more weigh in from other maintainers on this, rather >>>> than >>>> iterating a 5th time today... >>> >>> This seems like an awful lot of pain. >>> >>> I don't think we should be looking to generate different code for >>> library vs executable GCC. >>> >>> I think we should look for *clean* first, then we can look at what >>> we could change if the compile-time performance isn't what we want. >>> >>> Lots of C++ code manages to pass around the implicit this pointer >>> and use it appropriately without that being a significant source of >>> performance concerns. I suspect GCC would be the same in that >>> regard. The cost of passing around & using that pointer is dwarfed >>> by all the other lameness we have. >> >> >> I'm afraid we don't have the luxury of slowing the compiler too much. > > Nor do we have the luxury of continuing to not deal with these long term > issues. Nor do we have the luxury of creating something so (*&@#$ ugly that > nobody is willing to work on it again in the future. Particularly when > there's no evidence it's going to be measurably slower. > > > >> >> Anyway, I don't see how a single this would help with all the global >> state, >> because there are various levels of global state. The tracer changes show >> just the easiest one, non-GTY pass that that is internal to the file, >> starts living at the start of the pass and can be forgotten at the end of >> the pass. > > Agreed, but we need to start somewhere and passes of this nature seem like a > reasonable place to me > > > > But, often pass has some of its global state, and calls functions >> >> from other files that access different global state (cfun, crtl, >> current_function_decl, lots of other things), some files have global state >> preserved across multiple passes, etc. Before we start changing anything, >> we need a firm plan for everything, otherwise we end up with a useless >> partial transformation. Some global state data is accessed only >> occassionally, but other is accessed all the time (cfun being a very good >> example here). > > I don't disagree in principle WRT partial transformations. But I also > believe that there is value in cleaning up the the simple stuff, even in > isolation. > > To take your specific examples: > > * Global state is global state needs to be available via > a global context pointer of some kind regardless of how we > handle pass-local data. > > * pass-local state that spans passes. I'd like to see an > example, but my gut feeling is such things are probably > going to either become global state or die. They'll have > to be handled on a case by case basis. > > In both cases, David's changes don't make those problems any easier or any > harder. He merely encapsulates pass-local data where it's easy to do so > right now. It's just a cleaner design/implementation and I'd like to see it > pursued. Without looking at the actual patch in question I'd say that changing things in a way that is clearly defined where we'd be thread safe is a good thing. For example after the frontend finished, or after the point where LTO would have run, or for all IPA analysis/transform phases (who usually iterate over all function bodies). Ideally we'd be able to work on multiple functions at once. The very first step in that direction would be to kill current_pass and have the pass manager pass in context (containing the struct function to work on for example, replacing uses of cfun) to the execute function of scalar passes. And then just conentrate on getting rid of (implicit) 'cfun' uses in passes (let them flag themselves with "I'm safe w/o cfun" and have the pass manager just NULL that). A lot of interfaces got a _fn variant already. Then of course interesting things such as dumpfiles pop up - a pass writes to a single dump-file, so multiple instances of that pass cannot run in parallel without disrupting that. A way out is to run in a pipelining mode instead, thus have only a single pass instance at a given time but have multiple functions being processed by different passes in parallel. That even avoids the need to cleanup state in passes themselves, it just needs caretaking of the global cfun like state. Richard. > Jeff
diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 63d114b..5f1bc0c 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1461,6 +1461,7 @@ OBJS = \ tree-vectorizer.o \ tree-vrp.o \ tree.o \ + universe.o \ valtrack.o \ value-prof.o \ var-tracking.o \ diff --git a/gcc/genstate.py b/gcc/genstate.py new file mode 100644 index 0000000..503c07d --- /dev/null +++ b/gcc/genstate.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +from collections import namedtuple +import re +import sys + +open_paren = r'\(' +close_paren = r'\)' +opt_ws = r'\s*' +grp_tokens = '(.+)' + +class StaticState(namedtuple('StaticState', ('kind', 'name'))): + PATTERN = \ + ('DEFINE_STATIC_STATE' + open_paren + opt_ws + + grp_tokens + opt_ws + ',' + opt_ws + + grp_tokens + opt_ws + close_paren) + pattern = re.compile(PATTERN) + + def write_struct_field(self, f): + f.write(' %s x_%s;\n' % (self.kind, self.name)) + +class StaticStateArray(namedtuple('StaticStateArray', ('kind', 'name', 'nelt'))): + PATTERN = \ + ('DEFINE_STATIC_STATE_ARRAY' + open_paren + opt_ws + + grp_tokens + opt_ws + ',' + opt_ws + + grp_tokens + opt_ws + ',' + opt_ws + + grp_tokens + opt_ws + close_paren) + pattern = re.compile(PATTERN) + + def write_struct_field(self, f): + f.write(' %s x_%s[%s];\n' % (self.kind, self.name, self.nelt)) + +class SourceFile: + def __init__(self, path): + self.path = path + self.vars = [] + with open(path) as f: + for line in f: + m = StaticState.pattern.match(line) + if m: + self.vars.append(StaticState(*m.groups())) + m = StaticStateArray.pattern.match(line) + if m: + self.vars.append(StaticStateArray(*m.groups())) + + @property + def struct_name(self): + return 'state_%s_def' % self.field_name + + @property + def field_name(self): + """ + Get the fieldname for this source file within (struct universe_def) + """ + result = self.path + for ch in '.-': + result = result.replace(ch, '_') + return result + + def write_struct(self, f): + f.write('struct %s\n' % self.struct_name) + f.write('{\n') + for var in self.vars: + var.write_struct_field(f) + f.write('};\n') + + def write_macros(self, f): + for var in self.vars: + f.write('#define %s \\\n' % var.name) + f.write(' universe.%s->x_%s\n' % (self.field_name, var.name)) + + def write_sizeof_decl(self, f): + f.write('extern size_t get_sizeof_%s ();\n' % self.struct_name) + + def write_sizeof_impl(self, f): + f.write('size_t get_sizeof_%s ()\n' % self.struct_name) + f.write('{\n') + f.write(' return sizeof (%s);\n' % self.struct_name) + f.write('}\n') + + def write_private_header(self, f): + warn_autogenerated(f) + f.write('#if !GLOBAL_STATE\n') + sf.write_struct(f) + f.write('\n') + sf.write_sizeof_impl(f) + f.write('\n') + sf.write_macros(f) + f.write('\n') + f.write('#endif /* !GLOBAL_STATE */\n') + +def warn_autogenerated(f): + f.write('/* This file is autogenerated (by genstate.py);' + ' do not edit! */\n') + +def write_universe_header(sfs, f): + warn_autogenerated(f) + f.write('#if !GLOBAL_STATE\n') + f.write('struct universe_def\n') + f.write('{\n') + for sf in sfs: + f.write(' struct %s *%s;\n' % (sf.struct_name, sf.field_name)) + f.write('};\n\n') + f.write('/* Functions for allocating state: */\n') + for sf in sfs: + sf.write_sizeof_decl(f); + f.write('#endif /* !GLOBAL_STATE */\n') + +sfs = [] +for path in ['tracer.c', 'ggc-page.c']: + sf = SourceFile(path) + sfs.append(sf) + with open(path + '.state.h', 'w') as f: + sf.write_private_header(f) + sf.write_private_header(sys.stdout) +with open('universe.h', 'w') as f: + write_universe_header(sfs, f) + write_universe_header(sfs, sys.stdout) diff --git a/gcc/ggc-page.c b/gcc/ggc-page.c index 5b18468..ba2875e 100644 --- a/gcc/ggc-page.c +++ b/gcc/ggc-page.c @@ -224,24 +224,27 @@ static const size_t extra_order_size_table[] = { #define PAGE_ALIGN(x) (((x) + G.pagesize - 1) & ~(G.pagesize - 1)) +#include "global-state.h" +#include "universe.h" + /* The Ith entry is the number of objects on a page or order I. */ -static unsigned objects_per_page_table[NUM_ORDERS]; +DEFINE_STATIC_STATE_ARRAY(unsigned, objects_per_page_table, NUM_ORDERS) /* The Ith entry is the size of an object on a page of order I. */ -static size_t object_size_table[NUM_ORDERS]; +DEFINE_STATIC_STATE_ARRAY(size_t, object_size_table, NUM_ORDERS) /* The Ith entry is a pair of numbers (mult, shift) such that ((k * mult) >> shift) mod 2^32 == (k / OBJECT_SIZE(I)) mod 2^32, for all k evenly divisible by OBJECT_SIZE(I). */ -static struct +struct inverse_table_def { size_t mult; unsigned int shift; -} -inverse_table[NUM_ORDERS]; +}; +DEFINE_STATIC_STATE_ARRAY(inverse_table_def, inverse_table, NUM_ORDERS) /* A page_entry records the status of an allocation page. This structure is dynamically sized to fit the bitmap in_use_p. */ @@ -343,7 +346,7 @@ struct free_object #endif /* The rest of the global variables. */ -static struct globals +struct globals { /* The Nth element in this array is a page with objects of size 2^N. If there are any pages with free objects, they will be at the @@ -457,7 +460,11 @@ static struct globals /* The overhead for each of the allocation orders. */ unsigned long long total_overhead_per_order[NUM_ORDERS]; } stats; -} G; +}; + +DEFINE_STATIC_STATE(globals, G) + +#include "ggc-page.c.state.h" /* The size in bytes required to maintain a bitmap for the objects on a page-entry. */ diff --git a/gcc/ggc-page.c.state.h b/gcc/ggc-page.c.state.h new file mode 100644 index 0000000..b359421 --- /dev/null +++ b/gcc/ggc-page.c.state.h @@ -0,0 +1,25 @@ +/* This file is autogenerated (by genstate.py); do not edit! */ +#if !GLOBAL_STATE +struct state_ggc_page_c_def +{ + unsigned x_objects_per_page_table[NUM_ORDERS]; + size_t x_object_size_table[NUM_ORDERS]; + inverse_table_def x_inverse_table[NUM_ORDERS]; + globals x_G; +}; + +size_t get_sizeof_state_ggc_page_c_def () +{ + return sizeof (state_ggc_page_c_def); +} + +#define objects_per_page_table \ + universe.ggc_page_c->x_objects_per_page_table +#define object_size_table \ + universe.ggc_page_c->x_object_size_table +#define inverse_table \ + universe.ggc_page_c->x_inverse_table +#define G \ + universe.ggc_page_c->x_G + +#endif /* !GLOBAL_STATE */ diff --git a/gcc/global-state.h b/gcc/global-state.h new file mode 100644 index 0000000..d317897 --- /dev/null +++ b/gcc/global-state.h @@ -0,0 +1,19 @@ +/* Crude testing hack for switching between: + global state + vs + per-universe state + This would be a config option controlling the whole build, so that + you'd use the former for a standalone build of gcc, and the latter + when building the code for use as a dynamic library. */ +#define GLOBAL_STATE 0 + +#if GLOBAL_STATE +#define DEFINE_STATIC_STATE(TYPE, NAME) static TYPE NAME; +#define DEFINE_STATIC_STATE_ARRAY(TYPE, NAME, NELT) static TYPE NAME[NELT]; +#else +#define DEFINE_STATIC_STATE(TYPE, NAME) +#define DEFINE_STATIC_STATE_ARRAY(TYPE, NAME, NELT) + +extern struct universe_def universe; +#endif + diff --git a/gcc/tracer.c b/gcc/tracer.c index 975cadb..c346bf1 100644 --- a/gcc/tracer.c +++ b/gcc/tracer.c @@ -57,13 +57,18 @@ static edge find_best_successor (basic_block); static edge find_best_predecessor (basic_block); static int find_trace (basic_block, basic_block *); +#include "global-state.h" +#include "universe.h" + /* Minimal outgoing edge probability considered for superblock formation. */ -static int probability_cutoff; -static int branch_ratio_cutoff; +DEFINE_STATIC_STATE(int, probability_cutoff) +DEFINE_STATIC_STATE(int, branch_ratio_cutoff) /* A bit BB->index is set if BB has already been seen, i.e. it is connected to some trace already. */ -sbitmap bb_seen; +DEFINE_STATIC_STATE(sbitmap, bb_seen) + +#include "tracer.c.state.h" static inline void mark_bb_seen (basic_block bb) diff --git a/gcc/tracer.c.state.h b/gcc/tracer.c.state.h new file mode 100644 index 0000000..95ccc11 --- /dev/null +++ b/gcc/tracer.c.state.h @@ -0,0 +1,22 @@ +/* This file is autogenerated (by genstate.py); do not edit! */ +#if !GLOBAL_STATE +struct state_tracer_c_def +{ + int x_probability_cutoff; + int x_branch_ratio_cutoff; + sbitmap x_bb_seen; +}; + +size_t get_sizeof_state_tracer_c_def () +{ + return sizeof (state_tracer_c_def); +} + +#define probability_cutoff \ + universe.tracer_c->x_probability_cutoff +#define branch_ratio_cutoff \ + universe.tracer_c->x_branch_ratio_cutoff +#define bb_seen \ + universe.tracer_c->x_bb_seen + +#endif /* !GLOBAL_STATE */ diff --git a/gcc/universe.c b/gcc/universe.c new file mode 100644 index 0000000..7a43851 --- /dev/null +++ b/gcc/universe.c @@ -0,0 +1,7 @@ +#include "config.h" +#include "system.h" +#include "global-state.h" +#include "universe.h" + +struct universe_def universe; + diff --git a/gcc/universe.h b/gcc/universe.h new file mode 100644 index 0000000..969b415 --- /dev/null +++ b/gcc/universe.h @@ -0,0 +1,15 @@ +/* This file is autogenerated (by genstate.py); do not edit! */ +#define GLOBAL_STATE 0 +#if !GLOBAL_STATE +struct universe_def +{ + struct state_tracer_c_def *tracer_c; + struct state_ggc_page_c_def *ggc_page_c; +}; + +extern struct universe_def universe; + +/* Functions for allocating state: */ +extern size_t get_sizeof_state_tracer_c_def (); +extern size_t get_sizeof_state_ggc_page_c_def (); +#endif /* !GLOBAL_STATE */