diff mbox

Fwd: [RFC][gomp4] Offloading patches (2/3): Add tables generation

Message ID 530648F8.2010409@codesourcery.com
State New
Headers show

Commit Message

Bernd Schmidt Feb. 20, 2014, 6:27 p.m. UTC
Okay, so given the resistance to a unique-string scheme I went back and 
adapted the mechanism from your patches for ptx. I guess we can use that 
for now; if and when it breaks I have something ready to replace it.

There were still a number of things in these patches that did not make 
sense to me and which I've changed. Let me know if there was a good 
reason for the way some of these things were originally done.
  * Functions and variables now go into different tables, otherwise
    intermixing between them could be a problem that causes tables to
    go out of sync between host and target (imagine one big table being
    generated by ptx lto1/mkoffload, and multiple small table fragments
    being linked together on the host side).
  * I've put the begin/end fragments for the host tables into crtstuff,
    which seems like the standard way of doing things.
  * Changed the generation of tables to lower the alignment so that
    there are no gaps when linking together multiple files.
  * All the target-specific image-generating code is moved out of
    lto-wrapper into a mkoffload tool.
  * Is there a reason to call a register function for the host tables?
    The way I've set it up, we register a target function/variable table
    while also passing a pointer to the __OPENMP_TARGET__ symbol which
    holds information about the host side tables.
  * There aren't any named sections in ptx, so I've added another target
    hook to write out the map.
  * An offload compiler is built with --enable-as-accelerator-for=, which
    eliminates the need for -fopenmp-target, and changes install paths so
    that the host compiler knows where to find it. No need for
    OFFLOAD_TARGET_COMPILERS anymore.

I'll still need to add gcc driver support to automatically set the 
OFFLOAD_TARGET_NAMES variable from the accelerator targets that were 
configured in. Currently that still needs to be set manually for testing.

I'm appending those parts of my current patch kit that seem relevant. 
This includes the ptx mkoffload tool and a patch to make a dummy 
GOMP_offload_register function. Most of the others are updated versions 
of patches I've posted before, and two adapted from Michael Zolotukhin's 
set (automatically generated files not included in the diffs for size 
reasons). How does this look?


Bernd
/* Offload image generation tool for ptx

   Nathan Sidwell <nathan@codesourcery.com>
   Bernd Schmidt <bernds@codesourcery.com>

   Munges PTX assembly into a C source file defining the PTX code as a
   string.

   This is not a complete assembler.  We presume the source is well
   formed from the compiler and can die horribly if it is not.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "intl.h"
#include <libgen.h>
#include "obstack.h"

/* Comment-introducer string.  NOTE(review): nothing in the visible part of
   this file references this macro -- confirm whether it is still needed.  */
#define COMMENT_PREFIX "#"

/* Classification of a token.  Values 0-0xff are left free so that a
   single-character token can use its own character code as its kind;
   the named kinds start directly above that range.  */
typedef enum Kind
{
  K_symbol  = 0x100,	/* a symbol */
  K_label   = 0x101,	/* a label definition (i.e. symbol:) */
  K_ident   = 0x102,	/* any other identifier */
  K_dotted  = 0x103,	/* dotted identifier */
  K_number  = 0x104,
  K_string  = 0x105,
  K_comment = 0x106
} Kind;

/* One lexical token.  The token text is not copied: PTR points into the
   buffer handed to tokenize and is not NUL-terminated, so LEN must always
   be used together with it.  */
typedef struct Token
{
  unsigned short kind : 12; /* a Kind value, or the character itself for
			       single-character tokens; 0 marks the end */
  unsigned short space : 1; /* preceded by space */
  unsigned short end : 1;   /* succeeded by end of line */
  /* Length of the token text in bytes.  Being an unsigned short, it cannot
     represent a token longer than 65535 bytes.  */
  unsigned short len;

  /* Start of the token text inside the input buffer.  */
  char const *ptr;
} Token;

/* Per-statement information, stored in Stmt::vis.  The low three bits
   (V_mask) hold the statement category; the remaining values are
   independent flag bits that are OR-ed on top of it.  */
typedef enum Vis
{
  /* Statement category.  */
  V_dot     = 0x00,	/* random pseudo */
  V_var     = 0x01,	/* var decl/defn */
  V_func    = 0x02,	/* func decl/defn */
  V_insn    = 0x03,	/* random insn */
  V_label   = 0x04,	/* label defn */
  V_comment = 0x05,
  V_pred    = 0x06,	/* predicate */
  V_mask    = 0x07,

  /* Flag bits.  */
  V_global  = 1 << 3,	/* globalize */
  V_weak    = 1 << 4,	/* weakly globalize */
  V_no_eol  = 1 << 5,	/* no end of line */
  V_prefix_comment = 1 << 6	/* prefixed comment */
} Vis;

/* One statement: a run of consecutive tokens plus classification bits.
   Statements are chained through NEXT into the decls/vars/fns lists.  */
typedef struct Stmt
{
  struct Stmt *next;	/* next statement in the list */
  Token *tokens;	/* first token of the statement */
  unsigned char vis;	/* Vis category and flag bits */
  unsigned len : 12;	/* number of tokens; 12 bits, so at most 4095 */
  unsigned sym : 12;	/* index (relative to TOKENS) of the associated
			   symbol token, or all ones when there is none */
} Stmt;

/* One entry of a singly linked list of names collected from the
   ":VAR_MAP " / ":FUNC_MAP " comments of the PTX input.  PTX_NAME is a
   heap-allocated, NUL-terminated copy of the name.

   The struct is given a typedef like Token and Stmt so that the bare
   name `id_map' used throughout the file is valid whether the file is
   built as C or as C++.  */
typedef struct id_map
{
  struct id_map *next;
  char *ptx_name;
} id_map;

/* Reading and tokenizing the input.  */
static const char *read_file (FILE *);
static Token *tokenize (const char *);

/* Writing tokens as fragments of a C string literal.  */
static void write_token (FILE *, const Token *);
static void write_tokens (FILE *, const Token *, unsigned, int);

/* Statement allocation, list handling and output.  */
static Stmt *alloc_stmt (unsigned, Token *, Token *, const Token *);
/* Allocate a comment statement covering tokens [S, E), with no symbol.  */
#define alloc_comment(S,E) alloc_stmt (V_comment, S, E, 0)
/* Push statement S onto the front of the list whose head is *V.  Lists are
   therefore built in reverse order; see rev_stmts.  */
#define append_stmt(V, S) ((S)->next = *(V), *(V) = (S))
static Stmt *rev_stmts (Stmt *);
static void write_stmt (FILE *, const Stmt *);
static void write_stmts (FILE *, const Stmt *);

/* Parsers; each takes the current token and returns the first token it
   did not consume.  */
static Token *parse_insn (Token *);
static Token *parse_list_nosemi (Token *);
static Token *parse_init (Token *);
static Token *parse_file (Token *);

/* The three statement lists that process writes out, in this order.  */
static Stmt *decls;
static Stmt *vars;
static Stmt *fns;

/* Name lists filled by record_id.  Each *_tail pointer addresses the NEXT
   field (or the list head) where the following entry is to be stored, so
   entries stay in input order.  */
static id_map *func_ids, **funcs_tail = &func_ids;
static id_map *var_ids, **vars_tail = &var_ids;

int debug = 1;				/* nonzero: keep temporary files (see maybe_unlink_file); hard-wired to 1 here.  */
int verbose = 1;				/* nonzero: echo commands and environment changes to stderr; hard-wired to 1 here.  */

/* Name of the response file currently used by fork_execute, or NULL.
   lto_wrapper_cleanup removes it when set.  */
static const char *args_name;

/* Install STRING (of the form NAME=VALUE) in the environment with putenv.
   When running verbosely, the string is first echoed to standard error.  */
static void
xputenv (const char *string)
{
  char *entry = CONST_CAST (char *, string);

  if (verbose)
    fprintf (stderr, "%s\n", string);

  putenv (entry);
}

static void maybe_unlink_file (const char *);

/* Remove the response file, if one is currently recorded in args_name.
   Only the first call does anything: the guard flag is raised before the
   unlink attempt, so a failure inside maybe_unlink_file (which ends up
   back here through fatal_perror) cannot recurse forever.  */

static void
lto_wrapper_cleanup (void)
{
  static bool cleanup_done = false;

  if (!cleanup_done)
    {
      cleanup_done = true;

      if (args_name != NULL)
	maybe_unlink_file (args_name);
    }
}

/* Report a failed system call and exit.  CMSGID is a printf-style message
   (translated with _()); the description of the errno value that was
   current on entry is appended after a colon.  Temporary files are cleaned
   up before exiting with FATAL_EXIT_CODE.  */

static void __attribute__ ((format (printf, 1, 2)))
fatal_perror (const char *cmsgid, ...)
{
  /* Capture errno first: the stdio calls below may overwrite it.  */
  const int saved_errno = errno;
  va_list args;

  fputs ("mkoffload: ", stderr);
  va_start (args, cmsgid);
  vfprintf (stderr, _(cmsgid), args);
  va_end (args);
  fprintf (stderr, ": %s\n", xstrerror (saved_errno));

  lto_wrapper_cleanup ();
  exit (FATAL_EXIT_CODE);
}

/* Report a fatal error and exit.  CMSGID is a printf-style message
   (translated with _()).  As in fatal_perror, temporary files are cleaned
   up before exiting with FATAL_EXIT_CODE, so that a failure reported
   through this function does not leave the response file behind.  */

static void __attribute__ ((format (printf, 1, 2)))
fatal (const char * cmsgid, ...)
{
  va_list ap;

  va_start (ap, cmsgid);
  fprintf (stderr, "mkoffload: ");
  vfprintf (stderr, _(cmsgid), ap);
  fprintf (stderr, "\n");
  va_end (ap);

  /* Safe against recursion: lto_wrapper_cleanup only acts once.  */
  lto_wrapper_cleanup ();
  exit (FATAL_EXIT_CODE);
}

/* Start the program ARGV[0] with arguments ARGV, which must end with a
   NULL entry, and return the pex object for it.  The caller collects the
   exit status with collect_wait.  In verbose mode the command line is
   echoed to standard error first.  Any failure to start is fatal.  */

static struct pex_obj *
collect_execute (char **argv)
{
  struct pex_obj *pex;
  const char *errmsg;
  int err;

  if (verbose)
    {
      char **p_argv;
      const char *str;

      for (p_argv = argv; (str = *p_argv) != NULL; p_argv++)
	fprintf (stderr, " %s", str);

      fprintf (stderr, "\n");
    }

  /* Flush our own output before the child can write anything.  */
  fflush (stdout);
  fflush (stderr);

  pex = pex_init (0, "mkoffload", NULL);
  if (pex == NULL)
    fatal_perror ("pex_init failed");

  /* Do not use PEX_LAST here, we use our stdout for communicating with
     collect2 or the linker-plugin.  Any output from the sub-process
     will confuse that.  */
  errmsg = pex_run (pex, PEX_SEARCH, argv[0], argv, NULL,
		    NULL, &err);
  if (errmsg != NULL)
    {
      /* ERRMSG is data, not a format: pass it as an argument so that any
	 '%' it may contain is printed literally.  */
      if (err != 0)
	{
	  errno = err;
	  fatal_perror ("%s", _(errmsg));
	}
      else
	fatal ("%s", _(errmsg));
    }

  return pex;
}

/* Collect the exit status of the process behind PEX and release PEX.
   PROG is the program name used in diagnostics.  A process that was
   killed by a signal or that exited with a nonzero code is reported with
   fatal; otherwise 0 is returned.  */

static int
collect_wait (const char *prog, struct pex_obj *pex)
{
  int status;

  if (!pex_get_status (pex, 1, &status))
    fatal_perror ("can't get program status");
  pex_free (pex);

  /* Clean exit: nothing to report.  */
  if (status == 0)
    return 0;

  if (WIFSIGNALED (status))
    {
      const int signo = WTERMSIG (status);

      if (WCOREDUMP (status))
	fatal ("%s terminated with signal %d [%s], core dumped",
	       prog, signo, strsignal (signo));
      else
	fatal ("%s terminated with signal %d [%s]",
	       prog, signo, strsignal (signo));
    }

  if (WIFEXITED (status))
    fatal ("%s returned %d exit status", prog, WEXITSTATUS (status));

  return 0;
}

/* Remove the temporary file FILE, unless temporaries are being kept
   (`debug' nonzero), in which case a note is printed to standard error
   instead.  A failure to remove the file other than "does not exist" is
   fatal.  */

static void
maybe_unlink_file (const char *file)
{
  if (debug)
    {
      fprintf (stderr, "[Leaving %s]\n", file);
      return;
    }

  if (unlink_if_ordinary (file) && errno != ENOENT)
    fatal_perror ("deleting file %s", file);
}

/* Execute program ARGV[0] with arguments ARGV (NULL-terminated) and wait
   for it to finish.  The arguments after ARGV[0] are written to a
   temporary response file and the program is invoked as "ARGV[0] @file".
   While the response file exists its name is kept in args_name so that
   the fatal-error paths can clean it up.  */

static void
fork_execute (char **argv)
{
  struct pex_obj *pex;
  char *new_argv[3];
  char *at_args;
  char *temp_name;
  FILE *args;
  int status;

  temp_name = make_temp_file (".args");
  args_name = temp_name;
  at_args = concat ("@", args_name, NULL);
  args = fopen (args_name, "w");
  if (args == NULL)
    fatal ("failed to open %s", args_name);

  status = writeargv (&argv[1], args);

  /* Buffered data is only flushed by fclose, so a write error may not
     show up until here; treat it like a failed writeargv.  */
  if (fclose (args))
    status = 1;

  if (status)
    fatal ("could not write to temporary file %s",  args_name);

  new_argv[0] = argv[0];
  new_argv[1] = at_args;
  new_argv[2] = NULL;

  pex = collect_execute (new_argv);
  collect_wait (new_argv[0], pex);

  maybe_unlink_file (args_name);
  args_name = NULL;
  /* The name returned by make_temp_file is heap-allocated.  */
  free (temp_name);
  free (at_args);
}

/* Append a new name to an id_map list.  The name is the text from P1 up
   to, but not including, the next newline; it is copied to the heap.
   *WHERE addresses the slot where the new entry is to be stored and is
   advanced to the new entry's NEXT field.  A missing newline is fatal.  */

static void
record_id (const char *p1, id_map ***where)
{
  const char *eol = strchr (p1, '\n');
  if (eol == NULL)
    fatal ("malformed ptx file");

  const size_t name_len = eol - p1;
  char *name = XNEWVEC (char, name_len + 1);
  memcpy (name, p1, name_len);
  name[name_len] = '\0';

  id_map *entry = XNEW (id_map);
  entry->ptx_name = name;
  entry->next = NULL;

  /* Link the entry in at the tail and move the tail past it.  */
  **where = entry;
  *where = &entry->next;
}

/* Read all of STREAM into a freshly allocated buffer and return it.  The
   buffer is NUL terminated (but remember that the file itself may contain
   NUL bytes).  When the stream is seekable its size is used as the
   initial allocation; otherwise the buffer starts at 16384 bytes.  It is
   doubled whenever it fills up.  */

static const char *
read_file (FILE *stream)
{
  size_t capacity = 16384;
  size_t used = 0;
  size_t got;
  char *text;

  if (fseek (stream, 0, SEEK_END) == 0)
    {
      /* Seekable: size the buffer from the file length.  */
      long size = ftell (stream);
      if (size >= 0)
	capacity = size + 100;
      fseek (stream, 0, SEEK_SET);
    }
  text = XNEWVEC (char, capacity);

  /* Always leave one byte free for the terminating NUL.  */
  while ((got = fread (text + used, 1, capacity - used - 1, stream)) != 0)
    {
      used += got;
      if (used + 1 == capacity)
	{
	  capacity *= 2;
	  text = XRESIZEVEC (char, text, capacity);
	}
    }

  text[used] = 0;
  return text;
}

/* Split the NUL-terminated text at PTR into tokens and return them as a
   heap-allocated array.  The array ends with a token whose kind is 0
   (produced when the terminating NUL is read).  Token text is not copied;
   each token points into the input.

   White space produces no token of its own; it is recorded in the `space'
   flag of the following token and, for a newline, in the `end' flag of the
   preceding token.  Comments become K_comment tokens whose text excludes
   the comment delimiters; a block comment yields one token per line.  */

static Token *
tokenize (const char *ptr)
{
  unsigned alloc = 1000;
  unsigned num = 0;
  Token *toks = XNEWVEC (Token, alloc);
  /* 0: not in a block comment; 1: inside one, more lines follow;
     2: the closing delimiter was just consumed.  The value is also the
     number of trailing bytes to drop from the token length below.  */
  int in_comment = 0;
  /* Nonzero if the previous token was not a comment.  */
  int not_comment = 0;

  for (;; num++)
    {
      const char *base;
      unsigned kind;
      int ws = 0;
      int eol = 0;

    again:
      base = ptr;
      /* Continue a block comment that spans several lines.  */
      if (in_comment)
	goto block_comment;
      /* By default a character is its own token kind.  */
      switch (kind = *ptr++)
	{
	default:
	  break;

	case '\n':
	  eol = 1;
	  /* Fall through */
	case ' ':
	case '\t':
	case '\r':
	case '\v':
	  /* White space */
	  ws = not_comment;
	  goto again;

	case '/':
	  {
	    if (*ptr == '/')
	      {
		/* line comment.  Do not include trailing \n */
		base += 2;
		for (; *ptr; ptr++)
		  if (*ptr == '\n')
		    break;
		kind = K_comment;
	      }
	    else if (*ptr == '*')
	      {
		/* block comment */
		base += 2;
		ptr++;

	      block_comment:
		/* A continuation line implies that the previous token was
		   followed by a newline.  */
		eol = in_comment;
		in_comment = 1;
		/* NOTE(review): if the input ends inside a block comment
		   this loop exits with in_comment still 1 and *ptr == 0,
		   so every following iteration jumps back here and emits
		   another comment token; it looks like the outer loop then
		   never terminates -- confirm.  */
		for (; *ptr; ptr++)
		  {
		    if (*ptr == '\n')
		      {
			ptr++;
			break;
		      }
		    if (ptr[0] == '*' && ptr[1] == '/')
		      {
			in_comment = 2;
			ptr += 2;
			break;
		      }
		  }
		kind = K_comment;
	      }
	    else
	      break;
	  }
	  break;

	case '"':
	  /* quoted string */
	  kind = K_string;
	  /* A backslash skips the character after it, so an escaped quote
	     does not end the string.  */
	  while (*ptr)
	    if (*ptr == '"')
	      {
		ptr++;
		break;
	      }
	    else if (*ptr++ == '\\')
	      ptr++;
	  break;

	case '.':
	  /* A dot not followed by a digit starts a dotted identifier;
	     otherwise it starts a number.  */
	  if (*ptr < '0' || *ptr > '9')
	    {
	      kind = K_dotted;
	      ws = not_comment;
	      goto ident;
	    }
	  /* FALLTHROUGH */
	case '0'...'9':
	  kind = K_number;
	  goto ident;
	  break;

	case '$':  /* local labels.  */
	case '%':  /* register names, pseudoes etc */
	  kind = K_ident;
	  goto ident;

	case 'a'...'z':
	case 'A'...'Z':
	case '_':
	  kind = K_symbol; /* possible symbol name */
	ident:
	  /* Shared scanner for identifiers and numbers.  */
	  for (; *ptr; ptr++)
	    {
	      if (*ptr >= 'A' && *ptr <= 'Z')
		continue;
	      if (*ptr >= 'a' && *ptr <= 'z')
		continue;
	      if (*ptr >= '0' && *ptr <= '9')
		continue;
	      if (*ptr == '_' || *ptr == '$')
		continue;
	      if (*ptr == '.' && kind != K_dotted)
		/* Idents starting with a dot, cannot have internal dots. */
		continue;
	      if ((*ptr == '+' || *ptr == '-')
		  && kind == K_number
		  && (ptr[-1] == 'e' || ptr[-1] == 'E'
		      || ptr[-1] == 'p' || ptr[-1] == 'P'))
		/* exponent */
		continue;
	      break;
	    }
	  /* A directly following colon turns any of these into a label;
	     the colon becomes part of the token.  */
	  if (*ptr == ':')
	    {
	      ptr++;
	      kind = K_label;
	    }
	  break;
	}

      /* Grow the token array when it is full.  */
      if (alloc == num)
	{
	  alloc *= 2;
	  toks = XRESIZEVEC (Token, toks, alloc);
	}
      Token *tok = toks + num;

      tok->kind = kind;
      tok->space = ws;
      tok->end = 0;
      tok->ptr = base;
      /* For block comments, drop the trailing newline (in_comment == 1)
	 or the two-character closing delimiter (in_comment == 2).  */
      tok->len = ptr - base - in_comment;
      /* State 2 (comment closed) goes back to 0; state 1 stays.  */
      in_comment &= 1;
      not_comment = kind != K_comment;
      if (eol && num)
	tok[-1].end = 1;
      /* Kind 0 is the terminating NUL: it is stored as the end marker.  */
      if (!kind)
	break;
    }

  return toks;
}

/* Write token TOK to OUT in the form it must take inside a C string
   literal.  A leading space is written if the token was preceded by
   white space, and the two characters `\n' are written after it if the
   token ended a line.

   String tokens get their surrounding quotes written as \" and every
   backslash or double quote inside them prefixed with a backslash, so
   that the C compiler reproduces the original PTX text exactly.  */

static void
write_token (FILE *out, Token const *tok)
{
  if (tok->space)
    fputc (' ', out);

  switch (tok->kind)
    {
    case K_string:
      {
	const char *c = tok->ptr + 1;
	/* Strip the two quotes.  A string cut short by the end of input
	   may be shorter than that; do not let the length wrap around.  */
	size_t len = tok->len >= 2 ? tok->len - 2 : 0;

	fputs ("\\\"", out);
	for (; len; len--, c++)
	  {
	    /* The tokenizer accepts \" inside a string, so a quote can
	       appear here; left unescaped it would end the generated C
	       literal early.  */
	    if (*c == '\\' || *c == '"')
	      fputc ('\\', out);
	    fputc (*c, out);
	  }
	fputs ("\\\"", out);
      }
      break;

    default:
      /* All other tokens shouldn't have anything magic in them */
      fprintf (out, "%.*s", (int) tok->len, tok->ptr);
      break;
    }
  if (tok->end)
    fputs ("\\n", out);
}

/* Write the LEN tokens starting at TOKS to OUT as one C string literal,
   indented by a tab.  If SPC is nonzero a space is added just before the
   closing quote.  */

static void
write_tokens (FILE *out, Token const *toks, unsigned len, int spc)
{
  unsigned ix;

  fputs ("\t\"", out);
  for (ix = 0; ix != len; ix++)
    write_token (out, &toks[ix]);
  if (spc)
    fputc (' ', out);
  fputc ('"', out);
}

/* Allocate a statement of kind/flags VIS covering the tokens from TOKENS
   up to, but not including, END.  SYM, if non-null, is the statement's
   symbol token and is recorded as an index relative to TOKENS.  The
   `space' flag of the first token is cleared.

   Statements are carved out of chunks of 1000 that are never freed.  */

static Stmt *
alloc_stmt (unsigned vis, Token *tokens, Token *end, Token const *sym)
{
  static unsigned remaining = 0;
  static Stmt *pool = 0;

  /* Start a new chunk when the current one is used up.  */
  if (remaining == 0)
    {
      remaining = 1000;
      pool = XNEWVEC (Stmt, remaining);
    }

  Stmt *fresh = pool;
  pool++;
  remaining--;

  tokens->space = 0;

  fresh->next = 0;
  fresh->vis = vis;
  fresh->tokens = tokens;
  fresh->len = end - tokens;
  if (sym)
    fresh->sym = sym - tokens;
  else
    fresh->sym = ~0;

  return fresh;
}

/* Reverse the statement list STMT in place and return its new head.
   Needed because append_stmt builds lists back to front.  */

static Stmt *
rev_stmts (Stmt *stmt)
{
  Stmt *reversed = 0;

  for (Stmt *rest; stmt; stmt = rest)
    {
      rest = stmt->next;
      stmt->next = reversed;
      reversed = stmt;
    }

  return reversed;
}

/* Write statement STMT to OUT as one string-literal fragment.  Comment
   statements produce no output.  Predicate statements get a trailing
   space inside the literal.  The fragment is followed by a newline, or by
   a tab when the statement carries V_no_eol.  */

static void
write_stmt (FILE *out, const Stmt *stmt)
{
  const unsigned category = stmt->vis & V_mask;

  if (category == V_comment)
    return;

  write_tokens (out, stmt->tokens, stmt->len, category == V_pred);
  fputc (stmt->vis & V_no_eol ? '\t' : '\n', out);
}

/* Write every statement of the list STMTS to OUT, in list order.  */

static void
write_stmts (FILE *out, const Stmt *stmts)
{
  const Stmt *cur = stmts;

  while (cur)
    {
      write_stmt (out, cur);
      cur = cur->next;
    }
}

/* Parse statements starting at TOK and push them onto the `fns' list,
   one statement per loop iteration.  Opening and closing braces adjust a
   nesting depth, and the loop runs until that depth is back at zero.
   Returns the first token not consumed.

   Several token flags (`space', `end') are adjusted on the way, which
   changes how the tokens are later written out.  */

static Token *
parse_insn (Token *tok)
{
  unsigned depth = 0;

  do
    {
      Stmt *stmt;
      Token *sym = 0;
      unsigned s = V_insn;
      Token *start = tok;

      switch (tok++->kind)
	{
	case K_comment:
	  /* Gather a run of comment tokens into one comment statement.  */
	  while (tok->kind == K_comment)
	    tok++;
	  stmt = alloc_comment (start, tok);
	  append_stmt (&fns, stmt);
	  /* NOTE(review): `continue' in a do-while goes to the
	     `while (depth)' test, so a comment run met while depth is
	     still 0 ends the loop -- confirm this is intended.  */
	  continue;

	case '{':
	  depth++;
	  break;

	case '}':
	  depth--;
	  break;

	case K_label:
	  /* Labels starting with '$' are not recorded as the symbol.  */
	  if (tok[-1].ptr[0] != '$')
	    sym = tok - 1;
	  tok[-1].end = 1;
	  s = V_label;
	  break;

	case '@':
	  /* Predicate: '@', an optional '!', then one more token.  */
	  tok->space = 0;
	  if (tok->kind == '!')
	    tok++;
	  if (tok->kind == K_symbol)
	    sym = tok;
	  tok++;
	  s = V_pred;
	  break;

	default:
	  /* Anything else runs up to and including the next ';'.  The
	     last K_symbol token seen becomes the statement's symbol, and
	     the space after each comma is suppressed.  */
	  for (; tok->kind != ';'; tok++)
	    {
	      if (tok->kind == ',')
		tok[1].space = 0;
	      else if (tok->kind == K_symbol)
		sym = tok;
	    }
	  tok++->end = 1;
	  break;
	}

      stmt = alloc_stmt (s, start, tok, sym);
      append_stmt (&fns, stmt);

      /* A comment on the same line as the statement: mark the statement
	 as not ending the output line and add the comment after it.  */
      if (!tok[-1].end && tok[0].kind == K_comment)
	{
	  stmt->vis |= V_no_eol;
	  stmt = alloc_comment (tok, tok + 1);
	  append_stmt (&fns, stmt);
	  tok++;
	}
    }
  while (depth);

  return tok;
}

/* Parse a directive followed by a comma separated list of tokens that is
   not terminated by a semicolon.  TOK is the directive itself.  The whole
   run becomes one V_dot statement on the `decls' list, and its last token
   is marked as ending a line.  Returns the first token after the list.  */

static Token *
parse_list_nosemi (Token *tok)
{
  Token *first = tok;

  for (;;)
    {
      /* Step onto the next list element; stop at the end of input.  */
      tok++;
      if (!tok->kind)
	break;

      /* Step onto what follows it; only a comma continues the list.  */
      tok++;
      if (tok->kind != ',')
	break;
    }

  tok[-1].end = 1;
  Stmt *directive = alloc_stmt (V_dot, first, tok, 0);
  append_stmt (&decls, directive);

  return tok;
}

/* Nonzero if token T (a pointer with `ptr' and `len' members) is the
   string literal S preceded by exactly one character, e.g. ".version" for
   S == "version".  sizeof (S) counts the literal's terminating NUL, which
   makes it equal to the length of S plus that one leading character.  */
#define is_keyword(T,S) \
  ((T)->len == sizeof (S) && memcmp ((T)->ptr + 1, (S), (T)->len - 1) == 0)

/* Parse a variable initializer starting at TOK.  Each run of tokens up to
   and including the next ',' or ';' becomes one V_insn statement on the
   `vars' list; the last K_symbol token in the run is recorded as its
   symbol.  Comments before or directly after a run become comment
   statements.  Parsing stops after the run that ends in ';'.  Returns the
   first token not consumed.  */

static Token *
parse_init (Token *tok)
{
  int last;

  do
    {
      Token *first = tok;
      Token const *name = 0;
      Stmt *item;

      /* Leading comments form a statement of their own.  */
      if (tok->kind == K_comment)
	{
	  do
	    tok++;
	  while (tok->kind == K_comment);
	  item = alloc_comment (first, tok);
	  append_stmt (&vars, item);
	  first = tok;
	}

      /* No space after an opening brace.  */
      if (tok->kind == '{')
	tok[1].space = 0;

      while (tok->kind != ',' && tok->kind != ';')
	{
	  if (tok->kind == K_symbol)
	    name = tok;
	  tok++;
	}

      /* No space after the separator either.  */
      tok[1].space = 0;
      last = tok->kind == ';';
      tok++;

      item = alloc_stmt (V_insn, first, tok, name);
      append_stmt (&vars, item);

      /* A comment on the same line stays on that line.  */
      if (!tok[-1].end && tok->kind == K_comment)
	{
	  item->vis |= V_no_eol;
	  item = alloc_comment (tok, tok + 1);
	  append_stmt (&vars, item);
	  tok++;
	}
    }
  while (!last);

  return tok;
}

/* Parse one top-level construct starting at TOK and return the first
   token not consumed.  The caller invokes this repeatedly until the end
   token (kind 0) is reached.

   Leading comments are collected first; those beginning with ":VAR_MAP "
   or ":FUNC_MAP " additionally record a name via record_id.  What follows
   decides where things go:
     - .version/.target/.address_size: parsed by parse_list_nosemi (decls);
     - a directive run ending at '{' or at a comment: treated as a function
       definition, body parsed by parse_insn (fns);
     - a non-extern .global ending at '=' or ';': a variable, with the
       initializer parsed by parse_init (vars);
     - any other directive run ending at '=' or ';': a declaration (decls);
     - anything not starting with a dotted identifier: skipped up to the
       end of the line.
   The collected comment statement is pushed onto the same list as the
   construct it precedes.  */

static Token *
parse_file (Token *tok)
{
  Stmt *comment = 0;

  if (tok->kind == K_comment)
    {
      Token *start = tok;

      while (tok->kind == K_comment)
	{
	  /* Comment text starts after the comment delimiter, so these
	     markers are matched at the very start of the comment body.  */
	  if (strncmp (tok->ptr, ":VAR_MAP ", 9) == 0)
	    record_id (tok->ptr + 9, &vars_tail);
	  if (strncmp (tok->ptr, ":FUNC_MAP ", 10) == 0)
	    record_id (tok->ptr + 10, &funcs_tail);
	  tok++;
	}
      comment = alloc_comment (start, tok);
      comment->vis |= V_prefix_comment;
    }

  if (tok->kind == K_dotted)
    {
      if (is_keyword (tok, "version")
	  || is_keyword (tok, "target")
	  || is_keyword (tok, "address_size"))
	{
	  if (comment)
	    append_stmt (&decls, comment);
	  tok = parse_list_nosemi (tok);
	}
      else
	{
	  unsigned vis = 0;
	  const Token *def = 0;
	  unsigned is_decl = 0;
	  Token *start;

	  /* Scan the directive/declarator part, collecting visibility
	     bits and remembering the last K_symbol token as the name
	     being declared or defined.  */
	  for (start = tok;
	       tok->kind && tok->kind != '=' && tok->kind != K_comment
		 && tok->kind != '{' && tok->kind != ';'; tok++)
	    {
	      if (is_keyword (tok, "global"))
		vis |= V_var;
	      else if (is_keyword (tok, "func")
		       || is_keyword (tok, "entry"))
		vis |= V_func;
	      else if (is_keyword (tok, "visible"))
		vis |= V_global;
	      else if (is_keyword (tok, "extern"))
		is_decl = 1;
	      else if (is_keyword (tok, "weak"))
		vis |= V_weak;
	      /* Normalize spacing around parentheses: a space before '(',
		 none after it, and one after ')' unless ';' follows.  */
	      if (tok->kind == '(')
		{
		  tok[1].space = 0;
		  tok[0].space = 1;
		}
	      else if (tok->kind == ')' && tok[1].kind != ';')
		tok[1].space = 1;

	      if (tok->kind == K_symbol)
		def = tok;
	    }

	  if (!tok->kind)
	    {
	      /* end of file */
	      if (comment)
		append_stmt (&fns, comment);
	    }
	  else if (tok->kind == '{'
		   || tok->kind == K_comment)
	    {
	      /* function defn */
	      Stmt *stmt = alloc_stmt (vis, start, tok, def);
	      if (comment)
		{
		  append_stmt (&fns, comment);
		  stmt->vis |= V_prefix_comment;
		}
	      append_stmt (&fns, stmt);
	      tok = parse_insn (tok);
	    }
	  else
	    {
	      /* The scan stopped at '=' or ';'.  */
	      int assign = tok->kind == '=';

	      tok++->end = 1;
	      if ((vis & V_mask) == V_var && !is_decl)
		{
		  /* variable */
		  Stmt *stmt = alloc_stmt (vis, start, tok, def);
		  if (comment)
		    {
		      append_stmt (&vars, comment);
		      stmt->vis |= V_prefix_comment;
		    }
		  append_stmt (&vars, stmt);
		  if (assign)
		    tok = parse_init (tok);
		}
	      else
		{
		  /* declaration */
		  Stmt *stmt = alloc_stmt (vis, start, tok, 0);
		  if (comment)
		    {
		      append_stmt (&decls, comment);
		      stmt->vis |= V_prefix_comment;
		    }
		  append_stmt (&decls, stmt);
		}
	    }
	}
    }
  else
    {
      /* Something strange.  Ignore it.  */
      if (comment)
	append_stmt (&fns, comment);

      /* Skip to the last token of the line (or the end of input).  */
      while (tok->kind && !tok->end)
	tok++;
    }
  return tok;
}

/* Convert the PTX assembly read from IN into a C source file written to
   OUT.  The generated file defines the PTX text as the string `ptx_code'
   (declarations first, then variables, then functions), the name tables
   `var_mappings' and `func_mappings', and a constructor that passes them
   to GOMP_offload_register.  */

static void
process (FILE *in, FILE *out)
{
  const char *input = read_file (in);
  Token *tok = tokenize (input);
  id_map *id;

  /* Parse at least once, then until the end token is reached.  */
  for (tok = parse_file (tok); tok->kind; tok = parse_file (tok))
    ;

  /* The PTX text, as a sequence of adjacent string literals.  */
  fputs ("static const char ptx_code[] = \n", out);
  write_stmts (out, rev_stmts (decls));
  write_stmts (out, rev_stmts (vars));
  write_stmts (out, rev_stmts (fns));
  fputs (";\n\n", out);

  /* The variable and function name tables, in input order.  */
  fputs ("static const char *var_mappings[] = {\n", out);
  for (id = var_ids; id; id = id->next)
    fprintf (out, "\t\"%s\"%s\n", id->ptx_name, id->next ? "," : "");
  fputs ("};\n\n", out);

  fputs ("static const char *func_mappings[] = {\n", out);
  for (id = func_ids; id; id = id->next)
    fprintf (out, "\t\"%s\"%s\n", id->ptx_name, id->next ? "," : "");
  fputs ("};\n\n", out);

  /* The registration hook.  */
  fputs ("extern void GOMP_offload_register (const void *, const char *,\n",
	 out);
  fputs ("				    void *, void *, void *);\n", out);

  fputs ("extern void *__OPENMP_TARGET__[];\n\n", out);
  fputs ("static __attribute__((constructor)) void init (void)\n{\n", out);
  fputs ("  GOMP_offload_register (__OPENMP_TARGET__, \"nvptx\",\n", out);
  fputs ("                         func_mappings, var_mappings, &ptx_code);\n",
	 out);
  fputs ("};\n", out);
}

/* Compile the C file INFILE into the object file OUTFILE by running
   "COMPILER INFILE -c -o OUTFILE" through fork_execute.  */

static void
compile_native (const char *infile, const char *outfile, const char *compiler)
{
  struct obstack argv_obstack;
  obstack_init (&argv_obstack);
  obstack_ptr_grow (&argv_obstack, compiler);
  obstack_ptr_grow (&argv_obstack, infile);
  obstack_ptr_grow (&argv_obstack, "-c");
  obstack_ptr_grow (&argv_obstack, "-o");
  obstack_ptr_grow (&argv_obstack, outfile);
  /* fork_execute walks the vector until it finds a NULL entry, so the
     vector must be terminated explicitly.  */
  obstack_ptr_grow (&argv_obstack, NULL);

  const char **new_argv = XOBFINISH (&argv_obstack, const char **);
  fork_execute (CONST_CAST (char **, new_argv));
  obstack_free (&argv_obstack, NULL);
}

/* Entry point.  Runs the accelerator compiler on the given arguments to
   produce PTX assembly in a temporary file, converts that assembly into a
   temporary C file with process, and compiles the C file with the host
   compiler named by COLLECT_GCC into the file given with -o.

   COLLECT_GCC must be set in the environment, and -o FILE is required.  */

int
main (int argc, char **argv)
{
  FILE *in = stdin;
  FILE *out = stdout;
  const char *outname = 0;

  char *collect_gcc = getenv ("COLLECT_GCC");
  if (collect_gcc == NULL)
    fatal ("COLLECT_GCC must be set.");
  /* dirname may modify its argument, so work on a copy.  */
  const char *gcc_path = dirname (ASTRDUP (collect_gcc));
  fprintf (stderr, "COLLECT_GCC is %s\n", collect_gcc);

  /* The accelerator driver lives next to the host driver and is named
     <host>-accel-<target>-gcc.  The extra bytes are for the '/' and the
     terminating NUL.  */
  size_t len = (strlen (DEFAULT_REAL_TARGET_MACHINE)
		+ strlen (DEFAULT_TARGET_MACHINE)
		+ strlen ("-accel--gcc") + 1
		+ strlen (gcc_path) + 1);
  char *driver = XALLOCAVEC (char, len);
  sprintf (driver, "%s/%s-accel-%s-gcc", gcc_path,
	   DEFAULT_REAL_TARGET_MACHINE, DEFAULT_TARGET_MACHINE);

  /* We may be called with all the arguments stored in some file and
     passed with @file.  Expand them into argv before processing.  */
  expandargv (&argc, &argv);

  struct obstack argv_obstack;
  obstack_init (&argv_obstack);
  obstack_ptr_grow (&argv_obstack, driver);
  obstack_ptr_grow (&argv_obstack, "-xlto");
  obstack_ptr_grow (&argv_obstack, "-m64");
  obstack_ptr_grow (&argv_obstack, "-S");

  /* Pass everything through except -o FILE, which names our own output.  */
  for (int ix = 1; ix != argc; ix++)
    {
      if (!strcmp (argv[ix], "-o") && ix + 1 != argc)
	outname = argv[++ix];
      else
	obstack_ptr_grow (&argv_obstack, argv[ix]);
    }

  /* Without an output name the final compile step would be handed a
     truncated command line; diagnose that up front.  */
  if (outname == NULL)
    fatal ("no output file specified (missing -o)");

  const char *tempfile = make_temp_file (".mkoffload");
  obstack_ptr_grow (&argv_obstack, "-o");
  obstack_ptr_grow (&argv_obstack, tempfile);
  /* fork_execute walks the vector until it finds a NULL entry, so the
     vector must be terminated explicitly.  */
  obstack_ptr_grow (&argv_obstack, NULL);
  const char **new_argv = XOBFINISH (&argv_obstack, const char **);

  /* Run the accelerator compiler without the host driver's search-path
     variables, then put them back for the host compile below.
     NOTE(review): a variable that was not set originally comes back as
     set-but-empty, and the saved pointers are used after unsetenv --
     confirm both are acceptable.  */
  char *execpath = getenv ("GCC_EXEC_PREFIX");
  char *cpath = getenv ("COMPILER_PATH");
  char *lpath = getenv ("LIBRARY_PATH");
  unsetenv ("GCC_EXEC_PREFIX");
  unsetenv ("COMPILER_PATH");
  unsetenv ("LIBRARY_PATH");

  fork_execute (CONST_CAST (char **, new_argv));
  obstack_free (&argv_obstack, NULL);

  xputenv (concat ("GCC_EXEC_PREFIX=", execpath, NULL));
  xputenv (concat ("COMPILER_PATH=", cpath, NULL));
  xputenv (concat ("LIBRARY_PATH=", lpath, NULL));

  in = fopen (tempfile, "r");
  if (!in)
    fatal ("cannot open intermediate ptx file");

  const char *ptx_cfile = make_temp_file (".c");

  out = fopen (ptx_cfile, "w");
  if (!out)
    fatal ("cannot open '%s'", ptx_cfile);

  process (in, out);
  /* The input has been read completely by process.  */
  fclose (in);
  fclose (out);

  compile_native (ptx_cfile, outname, collect_gcc);

  return 0;
}

Comments

Ilya Verbin Feb. 21, 2014, 3:17 p.m. UTC | #1
2014-02-20 22:27 GMT+04:00 Bernd Schmidt <bernds@codesourcery.com>:
> There were still a number of things in these patches that did not make sense
> to me and which I've changed. Let me know if there was a good reason for the
> way some of these things were originally done.
>  * Functions and variables now go into different tables, otherwise
>    intermixing between them could be a problem that causes tables to
>    go out of sync between host and target (imagine one big table being
>    generated by ptx lto1/mkoffload, and multiple small table fragments
>    being linked together on the host side).

What do you mean by multiple small table fragments?
The tables from every object file should be joined together while
linking DSO in the same order for both host and target.
If you need to join tables from multiple target images into one big
table, the host tables also should be joined in the same order. In our
case we're obtaining each target table while loading the image to
target device, and merging it with a corresponding host table.
How will splitting functions and global vars into 2 tables help to
avoid intermixing?

>  * Is there a reason to call a register function for the host tables?
>    The way I've set it up, we register a target function/variable table
>    while also passing a pointer to the __OPENMP_TARGET__ symbol which
>    holds information about the host side tables.

Suppose there is liba, that depends on libb, that depends on libc.
Also corresponding target image tgtimga depends on tgtimgb, that
depends on tgtimgc. When liba is going to start offloaded function, it
calls GOMP_target with a pointer to its descriptor, which contains a
pointer to tgtimga. But how does GOMP_target know that it should also
load tgtimgb and tgtimgc to target? And where to get their descriptors
from?
That's why we have added host-side DSO registration. In this example
they are loaded on the host in the following order: libc, libb, liba. In
the same order they are registered in libgomp, and loaded onto the target
device during initialization. In the same order the tables received
from the target are merged with the host tables from the descriptors.

> I'm appending those parts of my current patch kit that seem relevant. This
> includes the ptx mkoffload tool and a patch to make a dummy
> GOMP_offload_register function. Most of the others are updated versions of
> patches I've posted before, and two adapted from Michael Zolotukhin's set
> (automatically generated files not included in the diffs for size reasons).
> How does this look?

I will take a closer look at your changes, try to run it, and send
feedback next week.

  -- Ilya
Bernd Schmidt Feb. 21, 2014, 3:41 p.m. UTC | #2
On 02/21/2014 04:17 PM, Ilya Verbin wrote:
> 2014-02-20 22:27 GMT+04:00 Bernd Schmidt <bernds@codesourcery.com>:
>> There were still a number of things in these patches that did not make sense
>> to me and which I've changed. Let me know if there was a good reason for the
>> way some of these things were originally done.
>>   * Functions and variables now go into different tables, otherwise
>>     intermixing between them could be a problem that causes tables to
>>     go out of sync between host and target (imagine one big table being
>>     generated by ptx lto1/mkoffload, and multiple small table fragments
>>     being linked together on the host side).
>
> What do you mean by multiple small table fragments?

Well, suppose you have file1.o and file2.o compiled for the host with a 
.offload_func_table_section in each, and they get linked together - each 
provides a fragment of the whole table.

> The tables from every object file should be joined together while
> linking DSO in the same order for both host and target.
> If you need to join tables from multiple target images into one big
> table, the host tables also should be joined in the same order.

The problem is that ptx does not have a linker, so we cannot exactly 
reproduce what happens on the host side. We have to process all host .o 
files in one single invocation of ptx lto1, and produce a single ptx 
assembly file, with a single function/variable table, from there. Having 
functions and variables separated gives us at least a small chance that 
the order will match that found in the host tables if the host table is 
produced by linking multiple fragments.

> Suppose there is liba, that depends on libb, that depends on libc.

What kind of dependencies between liba and libb do you expect to be able 
to support on the target side? References to each other's functions and 
variables?


Bernd
Ilya Verbin Feb. 21, 2014, 6 p.m. UTC | #3
2014-02-21 19:41 GMT+04:00 Bernd Schmidt <bernds@codesourcery.com>:
> The problem is that ptx does not have a linker, so we cannot exactly
> reproduce what happens on the host side. We have to process all host .o
> files in one single invocation of ptx lto1, and produce a single ptx
> assembly file, with a single function/variable table, from there. Having
> functions and variables separated gives us at least a small chance that the
> order will match that found in the host tables if the host table is produced
> by linking multiple fragments.

If ptx lto1 processes all .o files in the order in which they were
passed to it, the resulting table should be consistent with the table
produced by host's lto1.

> What kind of dependencies between liba and libb do you expect to be able to
> support on the target side? References to each other's functions and
> variables?

Yes, references to global variables and calls to functions, marked
with "omp declare target".
Ilya Verbin Feb. 28, 2014, 4:09 p.m. UTC | #4
2014-02-20 22:27 GMT+04:00 Bernd Schmidt <bernds@codesourcery.com>:
>  * Functions and variables now go into different tables, otherwise
>    intermixing between them could be a problem that causes tables to
>    go out of sync between host and target (imagine one big table being
>    generated by ptx lto1/mkoffload, and multiple small table fragments
>    being linked together on the host side).

If you need 2 different tables for funcs and vars, we can also use
them. But I still don't understand how it will help synchronization
between host and target tables.

>  * I've put the begin/end fragments for the host tables into crtstuff,
>    which seems like the standard way of doing things.

Our plan was that the host side descriptor __OPENMP_TARGET__ will
contain (in addition to func/var table) pointers to the images for all
enabled accelerators (e.g. omp_image_nvptx_start and
omp_image_intelmic_start), therefore we generated it in the
lto-wrapper. But if the number of accelerators and their types/names
will be defined during configuration, then it's ok to generate the
descriptor in crtstuff.

>  * Is there a reason to call a register function for the host tables?
>    The way I've set it up, we register a target function/variable table
>    while also passing a pointer to the __OPENMP_TARGET__ symbol which
>    holds information about the host side tables.

In our case we can't register target table with a call to libgomp, it
can be obtained only from the accelerator. Therefore we propose a
target-independent approach: during device initialization libgomp
calls 2 functions from the plugin (or this can be implemented by a
single function):
1. devicep->device_load_image_func, which will load target image (its
pointer will be taken from the host descriptor);
2. devicep->device_get_table_func, which in our case connects to the
device and receives its table. And in your case it will return
func_mappings and var_mappings. Will it work for you?

>  * An offload compiler is built with --enable-as-accelerator-for=, which
>    eliminates the need for -fopenmp-target, and changes install paths so
>    that the host compiler knows where to find it. No need for
>    OFFLOAD_TARGET_COMPILERS anymore.

Unfortunately I don't fully understand this configure magic... When a
user specifies 2 or 3 accelerators during configuration with
--enable-accelerators, will several different accel-gccs be built?

Thanks,
  -- Ilya
Bernd Schmidt Feb. 28, 2014, 4:21 p.m. UTC | #5
On 02/28/2014 05:09 PM, Ilya Verbin wrote:
> 2014-02-20 22:27 GMT+04:00 Bernd Schmidt <bernds@codesourcery.com>:
>>   * Functions and variables now go into different tables, otherwise
>>     intermixing between them could be a problem that causes tables to
>>     go out of sync between host and target (imagine one big table being
>>     generated by ptx lto1/mkoffload, and multiple small table fragments
>>     being linked together on the host side).
>
> If you need 2 different tables for funcs and vars, we can also use
> them. But I still don't understand how it will help synchronization
> between host and target tables.

I think it won't help that much - I still think this entire scheme is 
likely to fail on nvptx. I'll try to construct an example at some point.

One other thing about the split tables is that we don't have to write a 
useless size of 1 for functions.

>>   * I've put the begin/end fragments for the host tables into crtstuff,
>>     which seems like the standard way of doing things.
>
> Our plan was that the host side descriptor __OPENMP_TARGET__ will
> contain (in addition to func/var table) pointers to the images for all
> enabled accelerators (e.g. omp_image_nvptx_start and
> omp_image_intelmic_start), therefore we generated it in the
> lto-wrapper.

The concept of "image" is likely to vary somewhat between accelerators. 
For ptx, it's just a string and it can't really be generated the same 
way as for your target where you can manipulate ELF images. So I think 
it is better to have a call to a gomp registration function for every 
offload target. That should also give you the ordering you said you 
wanted between shared libraries.

>>   * Is there a reason to call a register function for the host tables?
>>     The way I've set it up, we register a target function/variable table
>>     while also passing a pointer to the __OPENMP_TARGET__ symbol which
>>     holds information about the host side tables.
>
> In our case we can't register target table with a call to libgomp, it
> can be obtained only from the accelerator. Therefore we propose a
> target-independent approach: during device initialization libgomp
> calls 2 functions from the plugin (or this can be implemented by a
> single function):
> 1. devicep->device_load_image_func, which will load target image (its
> pointer will be taken from the host descriptor);
> 2. devicep->device_get_table_func, which in our case connects to the
> device and receives its table. And in your case it will return
> func_mappings and var_mappings. Will it work for you?

Probably. I think the constructor call to the gomp registration function 
would contain an opaque pointer to whatever data the target wants, so it 
can arrange its image/table data in whatever way it likes.

It would help to see the code you have on the libgomp side, I don't 
believe that's been posted yet?

> Unfortunately I don't fully understand this configure magic... When a
> user specifies 2 or 3 accelerators during configuration with
> --enable-accelerators, will several different accel-gccs be built?

No - the idea is that --enable-accelerator= is likely specific to ptx, 
where we really just want to build a gcc and no target libraries, so 
building it alongside the host in an accel-gcc subdirectory is ideal.

For your use case, I'd imagine the offload compiler would be built 
relatively normally as a full build with 
"--enable-as-accelerator-for=x86_64-linux", which would install it into 
locations where the host will eventually be able to find it. Then the 
host compiler would be built with another new configure option (as yet 
unimplemented in my patch set) "--enable-offload-targets=mic,..." which 
would tell the host compiler about the pre-built offload target 
compilers. On the ptx side, "--enable-accelerator=ptx" would then also 
add ptx to the list of --enable-offload-targets.
Naming of all these configure options can be discussed, I have no real 
preference for any of them.


Bernd
Ilya Verbin March 5, 2014, 5:15 p.m. UTC | #6
On 28 Feb 17:21, Bernd Schmidt wrote:
> I think it won't help that much - I still think this entire scheme
> is likely to fail on nvptx. I'll try to construct an example at some
> point.
> 
> One other thing about the split tables is that we don't have to
> write a useless size of 1 for functions.
> 
> 
> The concept of "image" is likely to vary somewhat between
> accelerators. For ptx, it's just a string and it can't really be
> generated the same way as for your target where you can manipulate
> ELF images. So I think it is better to have a call to a gomp
> registration function for every offload target. That should also
> give you the ordering you said you wanted between shared libraries.
> 
> 
> Probably. I think the constructor call to the gomp registration
> function would contain an opaque pointer to whatever data the target
> wants, so it can arrange its image/table data in whatever way it
> likes.

Assuming that we're using the scheme with tables.
Every DSO with offloading must contain a constructor call to GOMP_offload_register (const void *openmp_target);
The openmp_target descriptor in every DSO will have target-independent entries (addresses of host tables) and target-dependent entries for each target. Its format may be like this:

void *__OPENMP_TARGET__[] =
{
  _omp_host_func_table;
  _omp_host_funcs_end;
  _omp_host_var_table;
  _omp_host_vars_end;
  _omp_num_targets;
  _omp_target_descs[]; /* array of tgt_desc */
}

struct tgt_desc
{
  int _omp_tgt_id;
  void *_omp_tgt_%s_image_start;
  void *_omp_tgt_%s_image_end;
  void *_omp_tgt_%s_func_mappings;
  void *_omp_tgt_%s_var_mappings;
  /* some other data if needed */
}

The mkoffload tool will fill in those symbols that are required by the corresponding target.
E.g. for the MIC and PTX targets the openmp_target descriptor will look like:

{
  &_omp_host_func_table,
  &_omp_host_funcs_end,
  &_omp_host_var_table,
  &_omp_host_vars_end,
  2,

  MIC_ID,
  &_omp_tgt_mic_image_start,
  &_omp_tgt_mic_image_end,
  &_omp_tgt_mic_func_mappings, /* 0 */
  &_omp_tgt_mic_var_mappings, /* 0 */

  PTX_ID,
  &_omp_tgt_ptx_image_start,
  &_omp_tgt_ptx_image_end, /* 0 */
  &_omp_tgt_ptx_func_mappings,
  &_omp_tgt_ptx_var_mappings
}

During the devices initialization libgomp will pass the openmp_target pointer to all plugins. Each plugin will scan over tgt_descs and find the required entries using the _omp_tgt_id.
Then the plugin loads the image to the target, does whatever it wants, and returns func_mappings and var_mappings to libgomp, because libgomp has to add host-target mapping into the splay tree.
How does this look?

BTW, do you have any estimate when you will commit your patches to the branch, so that we could merge them with ours, and get something working for everybody?

  -- Ilya
Bernd Schmidt March 6, 2014, 8:47 a.m. UTC | #7
On 03/05/2014 06:15 PM, Ilya Verbin wrote:
> On 28 Feb 17:21, Bernd Schmidt wrote:
>> I think it won't help that much - I still think this entire scheme
>> is likely to fail on nvptx. I'll try to construct an example at
>> some point.
>>
>> One other thing about the split tables is that we don't have to
>> write a useless size of 1 for functions.
>>
>>
>> The concept of "image" is likely to vary somewhat between
>> accelerators. For ptx, it's just a string and it can't really be
>> generated the same way as for your target where you can manipulate
>> ELF images. So I think it is better to have a call to a gomp
>> registration function for every offload target. That should also
>> give you the ordering you said you wanted between shared
>> libraries.
>>
>>
>> Probably. I think the constructor call to the gomp registration
>> function would contain an opaque pointer to whatever data the
>> target wants, so it can arrange its image/table data in whatever
>> way it likes.
>
> Assuming that we're using the scheme with tables. Every DSO with
> offloading must contain a constructor call to GOMP_offload_register
> (const void *openmp_target); The openmp_target descriptor in every
> DSO will have target-independent entries (addresses of host tables)
> and target-dependent entries for each target. Its format may be like
> this:
>
> void *__OPENMP_TARGET__[] = { _omp_host_func_table;
> _omp_host_funcs_end; _omp_host_var_table; _omp_host_vars_end;
> _omp_num_targets; _omp_target_descs[]; /* array of tgt_desc */ }

I don't see why you want the array of target descriptors - it would take
some effort to construct, and as far as I can tell it's unnecessary. You
can just pass a pointer to the corresponding descriptor to every
GOMP_offload_register call.


> struct tgt_desc { int _omp_tgt_id; void *_omp_tgt_%s_image_start;
> void *_omp_tgt_%s_image_end; void *_omp_tgt_%s_func_mappings; void
> *_omp_tgt_%s_var_mappings; /* some other data if needed */ }

This looks reasonable.

> During the devices initialization libgomp will pass the openmp_target
> pointer to all plugins. Each plugin will scan over tgt_descs and find
> the required entries using the _omp_tgt_id.

Once again, that seems unnecessarily complicated. The plugins can 
register their target ID with libgomp, and when libgomp sees a 
GOMP_offload_register call with the corresponding target ID, it can 
invoke the appropriate plugin immediately.

> BTW, do you have any estimate when you will commit your patches to
> the branch, so that we could merge them with ours, and get something
> working for everybody?

I've been waiting for us to reach agreement on how things should look. 
If there are patches in the series that you're happy with, let me know 
and I can commit them (it may be next week though).


Bernd
Ilya Verbin March 6, 2014, 11:11 a.m. UTC | #8
2014-03-06 12:47 GMT+04:00 Bernd Schmidt <bernds@codesourcery.com>:
> I don't see why you want the array of target descriptors - it would take
> some effort to construct, and as far as I can tell it's unnecessary. You
> can just pass a pointer to the corresponding descriptor to every
> GOMP_offload_register call.
>
> Once again, that seems unnecessarily complicated. The plugins can register
> their target ID with libgomp, and when libgomp sees a GOMP_offload_register
> call with the corresponding target ID, it can invoke the appropriate plugin
> immediately.

Do I understand correctly, that you propose to do so:

extern void *_omp_host_func_table[];
extern void *_omp_host_var_table[];
extern void *_omp_host_funcs_end[];
extern void *_omp_host_vars_end[];

void *__OPENMP_TARGET_HOST__[]
__attribute__ ((visibility ("protected"))) =
{
  &_omp_host_func_table, &_omp_host_funcs_end,
  &_omp_host_var_table, &_omp_host_vars_end
};

extern void *__OPENMP_TARGET_MIC__[];
extern void *__OPENMP_TARGET_PTX__[];
extern void GOMP_offload_register_host (const void *);
extern void GOMP_offload_register_target (const void *);

__attribute__ ((constructor))
static void
init (void)
{
  GOMP_offload_register_host (__OPENMP_TARGET_HOST__);
  GOMP_offload_register_target (__OPENMP_TARGET_MIC__);
  GOMP_offload_register_target (__OPENMP_TARGET_PTX__);
}

Where __OPENMP_TARGET_MIC__ and __OPENMP_TARGET_PTX__ descriptors
should be generated in the corresponding mkoffload tools.

  -- Ilya
Bernd Schmidt March 6, 2014, 11:53 a.m. UTC | #9
On 03/06/2014 12:11 PM, Ilya Verbin wrote:

> Do I understand correctly, that you propose to do so:
>
> extern void *_omp_host_func_table[];
> extern void *_omp_host_var_table[];
> extern void *_omp_host_funcs_end[];
> extern void *_omp_host_vars_end[];
>
> void *__OPENMP_TARGET_HOST__[]
> __attribute__ ((visibility ("protected"))) =
> {
>    &_omp_host_func_table, &_omp_host_funcs_end,
>    &_omp_host_var_table, &_omp_host_vars_end
> };

So far, yes (maybe just call it __OPENMP_HOST_TABLE__).

> extern void *__OPENMP_TARGET_MIC__[];
> extern void *__OPENMP_TARGET_PTX__[];
> extern void GOMP_offload_register_host (const void *);
> extern void GOMP_offload_register_target (const void *);
>
> __attribute__ ((constructor))
> static void
> init (void)
> {
>    GOMP_offload_register_host (__OPENMP_TARGET_HOST__);
>    GOMP_offload_register_target (__OPENMP_TARGET_MIC__);
>    GOMP_offload_register_target (__OPENMP_TARGET_PTX__);
> }
>
> Where __OPENMP_TARGET_MIC__ and __OPENMP_TARGET_PTX__ descriptors
> should be generated in the corresponding mkoffload tools.

No. I don't think we need a global constructor for registering 
__OPENMP_TARGET_HOST__ - this would unnecessarily bloat crtbegin/crtend. 
  We also shouldn't need to have the target tables known outside of the 
image constructed by the mkoffload tools.  The way I imagine it, every 
mkoffload tool creates its own constructor that looks like something 
like this:

__attribute__ ((constructor)) static void
init (void)
{
    GOMP_offload_register_target (__OPENMP_TARGET_HOST__,
                                  PTX_ID, ptx_target_table);
}

That creates a mapping between host and target table for PTX_ID. If 
there are multiple shared libraries with offload support, you can still 
obtain the ordering you want from these GOMP_offload_register_target 
calls. Everything is nicely private to the mkoffload-generated image.

It's implemented in almost this fashion (slightly different naming and 
args, and no real support in libgomp) in the patch kit I sent.


Bernd
Ilya Verbin March 6, 2014, 12:52 p.m. UTC | #10
2014-03-06 15:53 GMT+04:00 Bernd Schmidt <bernds@codesourcery.com>:
> No. I don't think we need a global constructor for registering
> __OPENMP_TARGET_HOST__ - this would unnecessarily bloat crtbegin/crtend.  We
> also shouldn't need to have the target tables known outside of the image
> constructed by the mkoffload tools.  The way I imagine it, every mkoffload
> tool creates its own constructor that looks like something like this:
>
>
> __attribute__ ((constructor)) static void
> init (void)
> {
>    GOMP_offload_register_target (__OPENMP_TARGET_HOST__,
>                                  PTX_ID, ptx_target_table);
> }
>
> That creates a mapping between host and target table for PTX_ID. If there
> are multiple shared libraries with offload support, you can still obtain the
> ordering you want from these GOMP_offload_register_target calls. Everything
> is nicely private to the mkoffload-generated image.
>
> It's implemented in almost this fashion (slightly different naming and args,
> and no real support in libgomp) in the patch kit I sent.

OK, now I get it, this looks good. I will rewrite the patch for
libgomp posted above to support this scheme.
Since we will pass __OPENMP_HOST_TABLE__ to GOMP_offload_register,
there is no need to pass it to GOMP_target[data/update], right?

  -- Ilya
Ilya Verbin June 17, 2014, 6:20 p.m. UTC | #11
Hello Bernd,

On 28 Feb 17:21, Bernd Schmidt wrote:
> For your use case, I'd imagine the offload compiler would be built
> relatively normally as a full build with
> "--enable-as-accelerator-for=x86_64-linux", which would install it
> into locations where the host will eventually be able to find it.
> Then the host compiler would be built with another new configure
> option (as yet unimplemented in my patch set)
> "--enable-offload-targets=mic,..." which would tell the host
> compiler about the pre-built offload target compilers. On the ptx

I don't get this part of the plan.  Where a host compiler will look for mkoffloads?

E.g., first I configure/make/install the target gcc and corresponding mkoffload with the following options:
--enable-accelerator=intelmic --enable-as-accelerator-for=x86_64-unknown-linux --prefix=/install_gcc/accel_intelmic

Next I configure/make/install the host gcc with:
--enable-accelerator=intelmic --prefix=/install_gcc/host

Now if I manually copy mkoffload from target's install dir into one of the dirs in host's $COMPILER_PATH,
then lto-wrapper finds it and everything works fine.
E.g.: mkdir -p /install_gcc/host/libexec/gcc/x86_64-unknown-linux-gnu/accel/intelmic/ &&
cp /install_gcc/accel_intelmic/libexec/gcc/x86_64-unknown-linux/4.10.0/accel/x86_64-unknown-linux-gnu/mkoffload
/install_gcc/host/libexec/gcc/x86_64-unknown-linux-gnu/accel/intelmic/

But what was your idea of how to tell host gcc about the path to mkoffload?

Thanks,
  -- Ilya
Bernd Schmidt June 17, 2014, 7:22 p.m. UTC | #12
On 06/17/2014 08:20 PM, Ilya Verbin wrote:
> Hello Bernd,
>
> On 28 Feb 17:21, Bernd Schmidt wrote:
>> For your use case, I'd imagine the offload compiler would be built
>> relatively normally as a full build with
>> "--enable-as-accelerator-for=x86_64-linux", which would install it
>> into locations where the host will eventually be able to find it.
>> Then the host compiler would be built with another new configure
>> option (as yet unimplemented in my patch set)
>> "--enable-offload-targets=mic,..." which would tell the host
>> compiler about the pre-built offload target compilers. On the ptx
>
> I don't get this part of the plan.  Where a host compiler will look for mkoffloads?
>
> E.g., first I configure/make/install the target gcc and corresponding mkoffload with the following options:
> --enable-accelerator=intelmic --enable-as-accelerator-for=x86_64-unknown-linux --prefix=/install_gcc/accel_intelmic
>
> Next I configure/make/install the host gcc with:
> --enable-accelerator=intelmic --prefix=/install_gcc/host

Try using the same prefix for both.


Bernd
Ilya Verbin June 18, 2014, 2:13 p.m. UTC | #13
On 17 Jun 21:22, Bernd Schmidt wrote:
> On 06/17/2014 08:20 PM, Ilya Verbin wrote:
> >I don't get this part of the plan.  Where a host compiler will look for mkoffloads?
> >
> >E.g., first I configure/make/install the target gcc and corresponding mkoffload with the following options:
> >--enable-accelerator=intelmic --enable-as-accelerator-for=x86_64-unknown-linux --prefix=/install_gcc/accel_intelmic
> >
> >Next I configure/make/install the host gcc with:
> >--enable-accelerator=intelmic --prefix=/install_gcc/host
> 
> Try using the same prefix for both.

I tried to do:
1. --enable-accelerator=intelmic --enable-as-accelerator-for=x86_64-intelmic-linux-gnu --prefix=/install_gcc/both
2. --enable-accelerator=intelmic --prefix=/install_gcc/both

In this case only bin/x86_64-intelmic-linux-gnu-accel-intelmic-gcc from accel compiler is saved.
All other binaries in bin, lib, lib64, libexec are replaced by host's ones.
Is there a way to have 2 working compilers and libs in the same prefix?

Thanks,
  -- Ilya
Bernd Schmidt June 18, 2014, 2:22 p.m. UTC | #14
On 06/18/2014 04:13 PM, Ilya Verbin wrote:
> On 17 Jun 21:22, Bernd Schmidt wrote:
>> On 06/17/2014 08:20 PM, Ilya Verbin wrote:
>>> I don't get this part of the plan.  Where a host compiler will look for mkoffloads?
>>>
>>> E.g., first I configure/make/install the target gcc and corresponding mkoffload with the following options:
>>> --enable-accelerator=intelmic --enable-as-accelerator-for=x86_64-unknown-linux --prefix=/install_gcc/accel_intelmic
>>>
>>> Next I configure/make/install the host gcc with:
>>> --enable-accelerator=intelmic --prefix=/install_gcc/host
>>
>> Try using the same prefix for both.
>
> I tried to do:
> 1. --enable-accelerator=intelmic --enable-as-accelerator-for=x86_64-intelmic-linux-gnu --prefix=/install_gcc/both
> 2. --enable-accelerator=intelmic --prefix=/install_gcc/both
>
> In this case only bin/x86_64-intelmic-linux-gnu-accel-intelmic-gcc from accel compiler is saved.
> All other binaries in bin, lib, lib64, libexec are replaced by host's ones.
> Is there a way to have 2 working compilers and libs in the same prefix?

Sure, as long as the target triplet is different.

What I think you need to do is
For the first compiler: --enable-as-accelerator-for=x86_64-pc-linux-gnu 
--target=x86_64-intelmic-linux-gnu --prefix=/somewhere
Build and install, then:
For the second: configure 
--enable-offload-targets=x86_64-intelmic-linux-gnu x86_64-pc-linux-gnu 
--prefix=/somewhere

No --enable-accelerator options at all. This should work, if it doesn't 
let me know what you find in /somewhere after installation for both 
compilers.


Bernd
Ilya Verbin June 19, 2014, 10:19 a.m. UTC | #15
On 18 Jun 16:22, Bernd Schmidt wrote:
> What I think you need to do is
> For the first compiler:
> --enable-as-accelerator-for=x86_64-pc-linux-gnu
> --target=x86_64-intelmic-linux-gnu --prefix=/somewhere
> 
> No --enable-accelerator options at all. This should work, if it
> doesn't let me know what you find in /somewhere after installation
> for both compilers.

It doesn't work without --enable-accelerator:

--enable-as-accelerator-for requires --enable-accelerator
make[1]: *** [configure-gcc] Error 1

  -- Ilya
diff mbox

Patch

Index: libgomp/libgomp.map
===================================================================
--- libgomp/libgomp.map	(revision 207857)
+++ libgomp/libgomp.map	(working copy)
@@ -226,6 +226,7 @@  GOMP_4.0 {
 	GOMP_target_end_data;
 	GOMP_target_update;
 	GOMP_teams;
+	GOMP_offload_register;
 } GOMP_3.0;
 
 OACC_2.0 {
Index: libgomp/libgomp_g.h
===================================================================
--- libgomp/libgomp_g.h	(revision 207857)
+++ libgomp/libgomp_g.h	(working copy)
@@ -213,7 +213,8 @@  extern void GOMP_target_end_data (void);
 extern void GOMP_target_update (int, const void *,
 				size_t, void **, size_t *, unsigned char *);
 extern void GOMP_teams (unsigned int, unsigned int);
-
+extern void GOMP_offload_register (const void *, const char *,
+				   const void *, const void *, void *);
 /* oacc-parallel.c */
 
 extern void GOACC_parallel (int, void (*) (void *), const void *,
Index: libgomp/target.c
===================================================================
--- libgomp/target.c	(revision 207857)
+++ libgomp/target.c	(working copy)
@@ -714,6 +714,13 @@  gomp_target_init (void)
   gomp_find_available_plugins ();
 }
 
+void
+GOMP_offload_register (const void *target_id, const char *target_name,
+		       const void *func_mappings, const void *var_mappings,
+		       void *target_data)
+{
+}
+
 #else /* PLUGIN_SUPPORT */
 /* If dlfcn.h is unavailable we always fallback to host execution.
    GOMP_target* routines are just stubs for this case.  */