From 8c8161cb623585d5d0c783b9d494b9b74ada6ced Mon Sep 17 00:00:00 2001
From: Richard Henderson <rth@redhat.com>
Date: Wed, 22 Oct 2014 12:52:07 -0400
Subject: [PATCH 059/415] aarch64: Tidy up abi manipulation

Avoid false abstraction, like get_x_addr. Avoid recomputing data
about the type being manipulated. Use NEON insns for HFA manipulation.

Note that some of the inline assembly will go away in a subsequent patch.
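
For background: an HFA (homogeneous floating-point aggregate) in the
AAPCS64 is a struct of one to four members of a single floating-point
type. A minimal illustration (not taken from this patch):

    /* A three-element HFA of floats.  When enough SIMD registers are
       free, the AAPCS64 passes it in s0, s1 and s2; otherwise it goes
       on the stack.  */
    typedef struct { float x, y, z; } float3;

extend_hfa_type below spreads such a value across one V register slot
per element; compress_hfa_type packs the elements back into contiguous
memory.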
---
 src/aarch64/ffi.c | 932 +++++++++++++++++++++---------------------------------
 1 file changed, 367 insertions(+), 565 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 6c338e1..d19384b 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -71,152 +71,6 @@ ffi_clear_cache (void *start, void *end)
 #endif
 }

-static void *
-get_x_addr (struct call_context *context, unsigned n)
-{
- return &context->x[n];
-}
-
-static void *
-get_s_addr (struct call_context *context, unsigned n)
-{
-#if defined __AARCH64EB__
- return &context->v[n].d[1].s[1];
-#else
- return &context->v[n].d[0].s[0];
-#endif
-}
-
-static void *
-get_d_addr (struct call_context *context, unsigned n)
-{
-#if defined __AARCH64EB__
- return &context->v[n].d[1];
-#else
- return &context->v[n].d[0];
-#endif
-}
-
-static void *
-get_v_addr (struct call_context *context, unsigned n)
-{
- return &context->v[n];
-}
-
-/* Return the memory location at which a basic type would reside
- were it to have been stored in register n. */
-
-static void *
-get_basic_type_addr (unsigned short type, struct call_context *context,
- unsigned n)
-{
- switch (type)
- {
- case FFI_TYPE_FLOAT:
- return get_s_addr (context, n);
- case FFI_TYPE_DOUBLE:
- return get_d_addr (context, n);
- case FFI_TYPE_LONGDOUBLE:
- return get_v_addr (context, n);
- case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_INT:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_SINT64:
- return get_x_addr (context, n);
- case FFI_TYPE_VOID:
- return NULL;
- default:
- FFI_ASSERT (0);
- return NULL;
- }
-}
-
-/* Return the alignment width for each of the basic types. */
-
-static size_t
-get_basic_type_alignment (unsigned short type)
-{
- switch (type)
- {
- case FFI_TYPE_FLOAT:
-#if defined (__APPLE__)
- return sizeof (UINT32);
-#endif
- case FFI_TYPE_DOUBLE:
- return sizeof (UINT64);
- case FFI_TYPE_LONGDOUBLE:
- return sizeof (long double);
- case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
-#if defined (__APPLE__)
- return sizeof (UINT8);
-#endif
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
-#if defined (__APPLE__)
- return sizeof (UINT16);
-#endif
- case FFI_TYPE_UINT32:
- case FFI_TYPE_INT:
- case FFI_TYPE_SINT32:
-#if defined (__APPLE__)
- return sizeof (UINT32);
-#endif
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_SINT64:
- return sizeof (UINT64);
-
- default:
- FFI_ASSERT (0);
- return 0;
- }
-}
-
-/* Return the size in bytes for each of the basic types. */
-
-static size_t
-get_basic_type_size (unsigned short type)
-{
- switch (type)
- {
- case FFI_TYPE_FLOAT:
- return sizeof (UINT32);
- case FFI_TYPE_DOUBLE:
- return sizeof (UINT64);
- case FFI_TYPE_LONGDOUBLE:
- return sizeof (long double);
- case FFI_TYPE_UINT8:
- return sizeof (UINT8);
- case FFI_TYPE_SINT8:
- return sizeof (SINT8);
- case FFI_TYPE_UINT16:
- return sizeof (UINT16);
- case FFI_TYPE_SINT16:
- return sizeof (SINT16);
- case FFI_TYPE_UINT32:
- return sizeof (UINT32);
- case FFI_TYPE_INT:
- case FFI_TYPE_SINT32:
- return sizeof (SINT32);
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- return sizeof (UINT64);
- case FFI_TYPE_SINT64:
- return sizeof (SINT64);
-
- default:
- FFI_ASSERT (0);
- return 0;
- }
-}
-
 extern void
 ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
 extended_cif *),
@@ -468,223 +322,211 @@ arg_init (struct arg_state *state, size_t call_frame_size)
 #endif
 }

-/* Return the number of available consecutive core argument
- registers. */
-
-static unsigned
-available_x (struct arg_state *state)
-{
- return N_X_ARG_REG - state->ngrn;
-}
-
-/* Return the number of available consecutive vector argument
- registers. */
-
-static unsigned
-available_v (struct arg_state *state)
-{
- return N_V_ARG_REG - state->nsrn;
-}
-
-static void *
-allocate_to_x (struct call_context *context, struct arg_state *state)
-{
- FFI_ASSERT (state->ngrn < N_X_ARG_REG);
- return get_x_addr (context, (state->ngrn)++);
-}
-
-static void *
-allocate_to_s (struct call_context *context, struct arg_state *state)
-{
- FFI_ASSERT (state->nsrn < N_V_ARG_REG);
- return get_s_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_d (struct call_context *context, struct arg_state *state)
-{
- FFI_ASSERT (state->nsrn < N_V_ARG_REG);
- return get_d_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_v (struct call_context *context, struct arg_state *state)
-{
- FFI_ASSERT (state->nsrn < N_V_ARG_REG);
- return get_v_addr (context, (state->nsrn)++);
-}
-
 /* Allocate an aligned slot on the stack and return a pointer to it. */
 static void *
-allocate_to_stack (struct arg_state *state, void *stack, size_t alignment,
- size_t size)
+allocate_to_stack (struct arg_state *state, void *stack,
+ size_t alignment, size_t size)
 {
- void *allocation;
+ size_t nsaa = state->nsaa;

 /* Round up the NSAA to the larger of 8 or the natural
 alignment of the argument's type. */
- state->nsaa = ALIGN (state->nsaa, alignment);
- state->nsaa = ALIGN (state->nsaa, alignment);
 #if defined (__APPLE__)
- if (state->allocating_variadic)
- state->nsaa = ALIGN (state->nsaa, 8);
+ if (state->allocating_variadic && alignment < 8)
+ alignment = 8;
 #else
- state->nsaa = ALIGN (state->nsaa, 8);
+ if (alignment < 8)
+ alignment = 8;
 #endif
+
+ nsaa = ALIGN (nsaa, alignment);
+ state->nsaa = nsaa + size;

- allocation = stack + state->nsaa;
-
- state->nsaa += size;
- return allocation;
+ return (char *)stack + nsaa;
 }

-static void
-copy_basic_type (void *dest, void *source, unsigned short type)
+static ffi_arg
+extend_integer_type (void *source, int type)
 {
- /* This is necessary to ensure that basic types are copied
- sign extended to 64-bits as libffi expects. */
 switch (type)
 {
- case FFI_TYPE_FLOAT:
- *(float *) dest = *(float *) source;
- break;
- case FFI_TYPE_DOUBLE:
- *(double *) dest = *(double *) source;
- break;
- case FFI_TYPE_LONGDOUBLE:
- *(long double *) dest = *(long double *) source;
- break;
 case FFI_TYPE_UINT8:
- *(ffi_arg *) dest = *(UINT8 *) source;
- break;
+ return *(UINT8 *) source;
 case FFI_TYPE_SINT8:
- *(ffi_sarg *) dest = *(SINT8 *) source;
- break;
+ return *(SINT8 *) source;
 case FFI_TYPE_UINT16:
- *(ffi_arg *) dest = *(UINT16 *) source;
- break;
+ return *(UINT16 *) source;
 case FFI_TYPE_SINT16:
- *(ffi_sarg *) dest = *(SINT16 *) source;
- break;
+ return *(SINT16 *) source;
 case FFI_TYPE_UINT32:
- *(ffi_arg *) dest = *(UINT32 *) source;
- break;
+ return *(UINT32 *) source;
 case FFI_TYPE_INT:
 case FFI_TYPE_SINT32:
- *(ffi_sarg *) dest = *(SINT32 *) source;
- break;
- case FFI_TYPE_POINTER:
+ return *(SINT32 *) source;
 case FFI_TYPE_UINT64:
- *(ffi_arg *) dest = *(UINT64 *) source;
- break;
 case FFI_TYPE_SINT64:
- *(ffi_sarg *) dest = *(SINT64 *) source;
- break;
- case FFI_TYPE_VOID:
+ return *(UINT64 *) source;
 break;
-
+ case FFI_TYPE_POINTER:
+ return *(uintptr_t *) source;
 default:
- FFI_ASSERT (0);
+ abort();
 }
 }

 static void
-copy_hfa_to_reg_or_stack (void *memory,
- ffi_type *ty,
- struct call_context *context,
- unsigned char *stack,
- struct arg_state *state)
-{
- int h = is_hfa (ty);
- int type = h & 0xff;
- unsigned elems = h >> 8;
-
- if (available_v (state) < elems)
- {
- /* There are insufficient V registers. Further V register allocations
- are prevented, the NSAA is adjusted (by allocate_to_stack ())
- and the argument is copied to memory at the adjusted NSAA. */
- state->nsrn = N_V_ARG_REG;
- memcpy (allocate_to_stack (state, stack, ty->alignment, ty->size),
- memory,
- ty->size);
- }
- else
- {
- int i;
- for (i = 0; i < elems; i++)
- {
- void *reg = allocate_to_v (context, state);
- copy_basic_type (reg, memory, type);
- memory += get_basic_type_size (type);
- }
- }
+extend_hfa_type (void *dest, void *src, int h)
+{
+ int n = (h >> 8);
+ int t = h & 0xff;
+ int f = (t - FFI_TYPE_FLOAT) * 4 + 4 - n;
+ void *x0;
+
+ asm volatile (
+ "adr %0, 0f\n"
+" add %0, %0, %1\n"
+" br %0\n"
+"0: ldp s16, s17, [%3]\n" /* S4 */
+" ldp s18, s19, [%3, #8]\n"
+" b 4f\n"
+" ldp s16, s17, [%3]\n" /* S3 */
+" ldr s18, [%3, #8]\n"
+" b 3f\n"
+" ldp s16, s17, [%3]\n" /* S2 */
+" b 2f\n"
+" nop\n"
+" ldr s16, [%3]\n" /* S1 */
+" b 1f\n"
+" nop\n"
+" ldp d16, d17, [%3]\n" /* D4 */
+" ldp d18, d19, [%3, #16]\n"
+" b 4f\n"
+" ldp d16, d17, [%3]\n" /* D3 */
+" ldr d18, [%3, #16]\n"
+" b 3f\n"
+" ldp d16, d17, [%3]\n" /* D2 */
+" b 2f\n"
+" nop\n"
+" ldr d16, [%3]\n" /* D1 */
+" b 1f\n"
+" nop\n"
+" ldp q16, q17, [%3]\n" /* Q4 */
+" ldp q18, q19, [%3, #16]\n"
+" b 4f\n"
+" ldp q16, q17, [%3]\n" /* Q3 */
+" ldr q18, [%3, #16]\n"
+" b 3f\n"
+" ldp q16, q17, [%3]\n" /* Q2 */
+" b 2f\n"
+" nop\n"
+" ldr q16, [%3]\n" /* Q1 */
+" b 1f\n"
+"4: str q19, [%2, #48]\n"
+"3: str q18, [%2, #32]\n"
+"2: str q17, [%2, #16]\n"
+"1: str q16, [%2]"
+ : "=&r"(x0)
+ : "r"(f * 12), "r"(dest), "r"(src)
+ : "memory", "v16", "v17", "v18", "v19");
 }

-/* Either allocate an appropriate register for the argument type, or if
- none are available, allocate a stack slot and return a pointer
- to the allocated space. */
-
 static void *
-allocate_to_register_or_stack (struct call_context *context,
- unsigned char *stack,
- struct arg_state *state,
- unsigned short type)
+compress_hfa_type (void *dest, void *reg, int h)
 {
- size_t alignment = get_basic_type_alignment (type);
- size_t size = alignment;
- switch (type)
+ int n = h >> 8;
+ switch (h & 0xff)
 {
 case FFI_TYPE_FLOAT:
- /* This is the only case for which the allocated stack size
- should not match the alignment of the type. */
- size = sizeof (UINT32);
- /* Fall through. */
+ switch (n)
+ {
+ default:
+ if (dest == reg)
+ {
+#ifdef __AARCH64EB__
+ dest += 12;
+#endif
+ }
+ else
+ *(float *)dest = *(float *)reg;
+ break;
+ case 2:
+ asm("ldp q16, q17, [%1]\n\t"
+ "st2 { v16.s, v17.s }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+ break;
+ case 3:
+ asm("ldp q16, q17, [%1]\n\t"
+ "ldr q18, [%1, #32]\n\t"
+ "st3 { v16.s, v17.s, v18.s }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+ break;
+ case 4:
+ asm("ldp q16, q17, [%1]\n\t"
+ "ldp q18, q19, [%1, #32]\n\t"
+ "st4 { v16.s, v17.s, v18.s, v19.s }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
+ break;
+ }
+ break;
+
 case FFI_TYPE_DOUBLE:
- if (state->nsrn < N_V_ARG_REG)
- return allocate_to_d (context, state);
- state->nsrn = N_V_ARG_REG;
+ switch (n)
+ {
+ default:
+ if (dest == reg)
+ {
+#ifdef __AARCH64EB__
+ dest += 8;
+#endif
+ }
+ else
+ *(double *)dest = *(double *)reg;
+ break;
+ case 2:
+ asm("ldp q16, q17, [%1]\n\t"
+ "st2 { v16.d, v17.d }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+ break;
+ case 3:
+ asm("ldp q16, q17, [%1]\n\t"
+ "ldr q18, [%1, #32]\n\t"
+ "st3 { v16.d, v17.d, v18.d }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+ break;
+ case 4:
+ asm("ldp q16, q17, [%1]\n\t"
+ "ldp q18, q19, [%1, #32]\n\t"
+ "st4 { v16.d, v17.d, v18.d, v19.d }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
+ break;
+ }
 break;
+
 case FFI_TYPE_LONGDOUBLE:
- if (state->nsrn < N_V_ARG_REG)
- return allocate_to_v (context, state);
- state->nsrn = N_V_ARG_REG;
- break;
- case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_INT:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_SINT64:
- if (state->ngrn < N_X_ARG_REG)
- return allocate_to_x (context, state);
- state->ngrn = N_X_ARG_REG;
+ if (dest != reg)
+ return memcpy (dest, reg, 16 * n);
 break;
+
 default:
 FFI_ASSERT (0);
 }
-
- return allocate_to_stack (state, stack, alignment, size);
+ return dest;
 }

-/* Copy a value to an appropriate register, or if none are
- available, to the stack. */
+/* Either allocate an appropriate register for the argument type, or if
+ none are available, allocate a stack slot and return a pointer
+ to the allocated space. */

-static void
-copy_to_register_or_stack (struct call_context *context,
- unsigned char *stack,
- struct arg_state *state,
- void *value,
- unsigned short type)
+static void *
+allocate_int_to_reg_or_stack (struct call_context *context,
+ struct arg_state *state,
+ void *stack, size_t size)
 {
- copy_basic_type (
- allocate_to_register_or_stack (context, stack, state, type),
- value,
- type);
+ if (state->ngrn < N_X_ARG_REG)
+ return &context->x[state->ngrn++];
+
+ state->ngrn = N_X_ARG_REG;
+ return allocate_to_stack (state, stack, size, size);
 }

 /* Marshall the arguments from FFI representation to procedure call
@@ -694,15 +536,21 @@ static unsigned
 aarch64_prep_args (struct call_context *context, unsigned char *stack,
 extended_cif *ecif)
 {
- int i;
+ ffi_cif *cif = ecif->cif;
+ void **avalue = ecif->avalue;
+ int i, nargs = cif->nargs;
 struct arg_state state;

- arg_init (&state, ALIGN(ecif->cif->bytes, 16));
+ arg_init (&state, cif->bytes);

- for (i = 0; i < ecif->cif->nargs; i++)
+ for (i = 0; i < nargs; i++)
 {
- ffi_type *ty = ecif->cif->arg_types[i];
- switch (ty->type)
+ ffi_type *ty = cif->arg_types[i];
+ size_t s = ty->size;
+ int h, t = ty->type;
+ void *a = avalue[i];
+
+ switch (t)
 {
 case FFI_TYPE_VOID:
 FFI_ASSERT (0);
@@ -710,82 +558,114 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack,

 /* If the argument is a basic type the argument is allocated to an
 appropriate register, or if none are available, to the stack. */
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
- case FFI_TYPE_LONGDOUBLE:
+ case FFI_TYPE_INT:
 case FFI_TYPE_UINT8:
 case FFI_TYPE_SINT8:
 case FFI_TYPE_UINT16:
 case FFI_TYPE_SINT16:
 case FFI_TYPE_UINT32:
- case FFI_TYPE_INT:
 case FFI_TYPE_SINT32:
- case FFI_TYPE_POINTER:
 case FFI_TYPE_UINT64:
 case FFI_TYPE_SINT64:
- copy_to_register_or_stack (context, stack, &state,
- ecif->avalue[i], ty->type);
+ case FFI_TYPE_POINTER:
+ do_pointer:
+ {
+ ffi_arg ext = extend_integer_type (a, t);
+ if (state.ngrn < N_X_ARG_REG)
+ context->x[state.ngrn++] = ext;
+ else
+ {
+ void *d = allocate_to_stack (&state, stack, ty->alignment, s);
+ state.ngrn = N_X_ARG_REG;
+ /* Note that the default abi extends each argument
+ to a full 64-bit slot, while the iOS abi allocates
+ only enough space. */
+#ifdef __APPLE__
+ memcpy(d, a, s);
+#else
+ *(ffi_arg *)d = ext;
+#endif
+ }
+ }
 break;

- case FFI_TYPE_STRUCT:
- if (is_hfa (ty))
- {
- copy_hfa_to_reg_or_stack (ecif->avalue[i], ty, context,
- stack, &state);
- }
- else if (ty->size > 16)
- {
- /* If the argument is a composite type that is larger than 16
- bytes, then the argument has been copied to memory, and
- the argument is replaced by a pointer to the copy. */
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
+ /* Scalar float is a degenerate case of HFA. */
+ h = t + 0x100;
+ goto do_hfa;

- copy_to_register_or_stack (context, stack, &state,
- &(ecif->avalue[i]), FFI_TYPE_POINTER);
- }
- else if (available_x (&state) >= (ty->size + 7) / 8)
- {
- /* If the argument is a composite type and the size in
- double-words is not more than the number of available
- X registers, then the argument is copied into consecutive
- X registers. */
- int j;
- for (j = 0; j < (ty->size + 7) / 8; j++)
- {
- memcpy (allocate_to_x (context, &state),
- &(((UINT64 *) ecif->avalue[i])[j]),
- sizeof (UINT64));
+ case FFI_TYPE_STRUCT:
+ {
+ void *dest;
+ int elems;
+
+ h = is_hfa (ty);
+ if (h)
+ {
+ do_hfa:
+ elems = h >> 8;
+ if (state.nsrn + elems <= N_V_ARG_REG)
+ {
+ dest = &context->v[state.nsrn];
+ state.nsrn += elems;
+ extend_hfa_type (dest, a, h);
+ break;
+ }
+ state.nsrn = N_V_ARG_REG;
+ dest = allocate_to_stack (&state, stack, ty->alignment, s);
+ }
+ else if (s > 16)
+ {
+ /* If the argument is a composite type that is larger than 16
+ bytes, then the argument has been copied to memory, and
+ the argument is replaced by a pointer to the copy. */
+ a = &avalue[i];
+ t = FFI_TYPE_POINTER;
+ goto do_pointer;
+ }
+ else
+ {
+ size_t n = (s + 7) / 8;
+ if (state.ngrn + n <= N_X_ARG_REG)
+ {
+ /* If the argument is a composite type and the size in
+ double-words is not more than the number of available
+ X registers, then the argument is copied into
+ consecutive X registers. */
+ dest = &context->x[state.ngrn];
+ state.ngrn += n;
+ }
+ else
+ {
+ /* Otherwise, there are insufficient X registers. Further
+ X register allocations are prevented, the NSAA is
+ adjusted and the argument is copied to memory at the
+ adjusted NSAA. */
+ state.ngrn = N_X_ARG_REG;
+ dest = allocate_to_stack (&state, stack, ty->alignment, s);
+ }
 }
- }
- else
- {
- /* Otherwise, there are insufficient X registers. Further X
- register allocations are prevented, the NSAA is adjusted
- (by allocate_to_stack ()) and the argument is copied to
- memory at the adjusted NSAA. */
- state.ngrn = N_X_ARG_REG;
-
- memcpy (allocate_to_stack (&state, stack, ty->alignment,
- ty->size), ecif->avalue[i], ty->size);
+ memcpy (dest, a, s);
 }
 break;

 default:
- FFI_ASSERT (0);
- break;
+ abort();
 }

 #if defined (__APPLE__)
- if (i + 1 == ecif->cif->aarch64_nfixedargs)
+ if (i + 1 == cif->aarch64_nfixedargs)
 {
 state.ngrn = N_X_ARG_REG;
 state.nsrn = N_V_ARG_REG;
-
 state.allocating_variadic = 1;
 }
 #endif
 }

- return ecif->cif->aarch64_flags;
+ return cif->aarch64_flags;
 }

 ffi_status
@@ -846,94 +726,61 @@ void
 ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 {
 extended_cif ecif;
- int h;
+ struct call_context context;
+ size_t stack_bytes;
+ int h, t;

 ecif.cif = cif;
 ecif.avalue = avalue;
 ecif.rvalue = rvalue;

- switch (cif->abi)
+ stack_bytes = cif->bytes;
+
+ memset (&context, 0, sizeof (context));
+ if (is_register_candidate (cif->rtype))
 {
- case FFI_SYSV:
- {
- struct call_context context;
- size_t stack_bytes;
+ ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);

- /* Figure out the total amount of stack space we need, the
- above call frame space needs to be 16 bytes aligned to
- ensure correct alignment of the first object inserted in
- that space hence the ALIGN applied to cif->bytes.*/
- stack_bytes = ALIGN(cif->bytes, 16);
+ t = cif->rtype->type;
+ switch (t)
+ {
+ case FFI_TYPE_INT:
+ case FFI_TYPE_UINT8:
+ case FFI_TYPE_SINT8:
+ case FFI_TYPE_UINT16:
+ case FFI_TYPE_SINT16:
+ case FFI_TYPE_UINT32:
+ case FFI_TYPE_SINT32:
+ case FFI_TYPE_POINTER:
+ case FFI_TYPE_UINT64:
+ case FFI_TYPE_SINT64:
+ *(ffi_arg *)rvalue = extend_integer_type (&context.x[0], t);
+ break;

- memset (&context, 0, sizeof (context));
- if (is_register_candidate (cif->rtype))
- {
- ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
- switch (cif->rtype->type)
- {
- case FFI_TYPE_VOID:
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
- case FFI_TYPE_LONGDOUBLE:
- case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_INT:
- case FFI_TYPE_SINT64:
- {
- void *addr = get_basic_type_addr (cif->rtype->type,
- &context, 0);
- copy_basic_type (rvalue, addr, cif->rtype->type);
- break;
- }
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
+ compress_hfa_type (rvalue, &context.v[0], 0x100 + t);
+ break;

- case FFI_TYPE_STRUCT:
- h = is_hfa (cif->rtype);
- if (h)
- {
- int j;
- int type = h & 0xff;
- int elems = h >> 8;
- for (j = 0; j < elems; j++)
- {
- void *reg = get_basic_type_addr (type, &context, j);
- copy_basic_type (rvalue, reg, type);
- rvalue += get_basic_type_size (type);
- }
- }
- else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
- {
- size_t size = ALIGN (cif->rtype->size, sizeof (UINT64));
- memcpy (rvalue, get_x_addr (&context, 0), size);
- }
- else
- {
- FFI_ASSERT (0);
- }
- break;
-
- default:
- FFI_ASSERT (0);
- break;
- }
- }
- else
- {
- context.x8 = (uintptr_t)rvalue;
- ffi_call_SYSV (aarch64_prep_args, &context, &ecif,
- stack_bytes, fn);
- }
- break;
- }
+ case FFI_TYPE_STRUCT:
+ h = is_hfa (cif->rtype);
+ if (h)
+ compress_hfa_type (rvalue, &context.v[0], h);
+ else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
+ memcpy (rvalue, &context.x[0], cif->rtype->size);
+ else
+ abort();
+ break;

- default:
- FFI_ASSERT (0);
- break;
+ default:
+ abort();
+ }
+ }
+ else
+ {
+ context.x8 = (uintptr_t)rvalue;
+ ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
 }
 }

@@ -1000,203 +847,158 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 ffi_cif *cif = closure->cif;
 void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
 void *rvalue = NULL;
- int i, h;
+ int i, h, nargs = cif->nargs;
 struct arg_state state;
+ ffi_type *rtype;

 arg_init (&state, ALIGN(cif->bytes, 16));

- for (i = 0; i < cif->nargs; i++)
+ for (i = 0; i < nargs; i++)
 {
 ffi_type *ty = cif->arg_types[i];
+ int t = ty->type;
+ size_t n, s = ty->size;

- switch (ty->type)
+ switch (t)
 {
 case FFI_TYPE_VOID:
 FFI_ASSERT (0);
 break;

+ case FFI_TYPE_INT:
 case FFI_TYPE_UINT8:
 case FFI_TYPE_SINT8:
 case FFI_TYPE_UINT16:
 case FFI_TYPE_SINT16:
 case FFI_TYPE_UINT32:
 case FFI_TYPE_SINT32:
- case FFI_TYPE_INT:
- case FFI_TYPE_POINTER:
 case FFI_TYPE_UINT64:
 case FFI_TYPE_SINT64:
+ case FFI_TYPE_POINTER:
+ avalue[i] = allocate_int_to_reg_or_stack (context, &state, stack, s);
+ break;
+
 case FFI_TYPE_FLOAT:
 case FFI_TYPE_DOUBLE:
 case FFI_TYPE_LONGDOUBLE:
- avalue[i] = allocate_to_register_or_stack (context, stack,
- &state, ty->type);
- break;
+ /* Scalar float is a degenerate case of HFA. */
+ h = t + 0x100;
+ goto do_hfa;

 case FFI_TYPE_STRUCT:
 h = is_hfa (ty);
 if (h)
 {
- unsigned n = h >> 8;
- if (available_v (&state) < n)
+ do_hfa:
+ n = h >> 8;
+ if (state.nsrn + n <= N_V_ARG_REG)
 {
- state.nsrn = N_V_ARG_REG;
- avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
- ty->size);
+ void *reg = &context->v[state.nsrn];
+ state.nsrn += n;
+
+ /* Eeek! We need a pointer to the structure, however the
+ homogeneous float elements are being passed in individual
+ registers, therefore for float and double the structure
+ is not represented as a contiguous sequence of bytes in
+ our saved register context. We don't need the original
+ contents of the register storage, so we reformat the
+ structure into the same memory. */
+ avalue[i] = compress_hfa_type (reg, reg, h);
 }
 else
 {
- switch (h & 0xff)
- {
- case FFI_TYPE_FLOAT:
- {
- /* Eeek! We need a pointer to the structure,
- however the homogeneous float elements are
- being passed in individual S registers,
- therefore the structure is not represented as
- a contiguous sequence of bytes in our saved
- register context. We need to fake up a copy
- of the structure laid out in memory
- correctly. The fake can be tossed once the
- closure function has returned hence alloca()
- is sufficient. */
- unsigned j;
- UINT32 *p = avalue[i] = alloca (ty->size);
- for (j = 0; j < n; j++)
- memcpy (&p[j],
- allocate_to_s (context, &state),
- sizeof (*p));
- break;
- }
-
- case FFI_TYPE_DOUBLE:
- {
- /* Eeek! We need a pointer to the structure,
- however the homogeneous float elements are
- being passed in individual S registers,
- therefore the structure is not represented as
- a contiguous sequence of bytes in our saved
- register context. We need to fake up a copy
- of the structure laid out in memory
- correctly. The fake can be tossed once the
- closure function has returned hence alloca()
- is sufficient. */
- unsigned j;
- UINT64 *p = avalue[i] = alloca (ty->size);
- for (j = 0; j < n; j++)
- memcpy (&p[j],
- allocate_to_d (context, &state),
- sizeof (*p));
- break;
- }
-
- case FFI_TYPE_LONGDOUBLE:
- memcpy (&avalue[i],
- allocate_to_v (context, &state),
- sizeof (*avalue));
- break;
-
- default:
- FFI_ASSERT (0);
- break;
- }
+ state.nsrn = N_V_ARG_REG;
+ avalue[i] = allocate_to_stack (&state, stack,
+ ty->alignment, s);
 }
 }
- else if (ty->size > 16)
+ else if (s > 16)
 {
 /* Replace Composite type of size greater than 16 with a
 pointer. */
- memcpy (&avalue[i],
- allocate_to_register_or_stack (context, stack,
- &state, FFI_TYPE_POINTER),
- sizeof (avalue[i]));
- }
- else if (available_x (&state) >= (ty->size + 7) / 8)
- {
- avalue[i] = get_x_addr (context, state.ngrn);
- state.ngrn += (ty->size + 7) / 8;
+ avalue[i] = *(void **)
+ allocate_int_to_reg_or_stack (context, &state, stack,
+ sizeof (void *));
 }
 else
 {
- state.ngrn = N_X_ARG_REG;
-
- avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
- ty->size);
+ n = (s + 7) / 8;
+ if (state.ngrn + n <= N_X_ARG_REG)
+ {
+ avalue[i] = &context->x[state.ngrn];
+ state.ngrn += n;
+ }
+ else
+ {
+ state.ngrn = N_X_ARG_REG;
+ avalue[i] = allocate_to_stack (&state, stack,
+ ty->alignment, s);
+ }
 }
 break;

 default:
- FFI_ASSERT (0);
- break;
+ abort();
 }
 }

- /* Figure out where the return value will be passed, either in
- registers or in a memory block allocated by the caller and passed
- in x8. */
-
- if (is_register_candidate (cif->rtype))
+ /* Figure out where the return value will be passed, either in registers
+ or in a memory block allocated by the caller and passed in x8. */
+ rtype = cif->rtype;
+ if (is_register_candidate (rtype))
 {
+ size_t s = rtype->size;
+ int t;
+
 /* Register candidates are *always* returned in registers. */

 /* Allocate a scratchpad for the return value, we will let the
 callee scrible the result into the scratch pad then move the
 contents into the appropriate return value location for the
 call convention. */
- rvalue = alloca (cif->rtype->size);
+ rvalue = alloca (s);
 (closure->fun) (cif, rvalue, avalue, closure->user_data);

 /* Copy the return value into the call context so that it is returned
 as expected to our caller. */
- switch (cif->rtype->type)
+ t = rtype->type;
+ switch (t)
 {
 case FFI_TYPE_VOID:
 break;

+ case FFI_TYPE_INT:
 case FFI_TYPE_UINT8:
 case FFI_TYPE_UINT16:
 case FFI_TYPE_UINT32:
- case FFI_TYPE_POINTER:
 case FFI_TYPE_UINT64:
 case FFI_TYPE_SINT8:
 case FFI_TYPE_SINT16:
- case FFI_TYPE_INT:
 case FFI_TYPE_SINT32:
 case FFI_TYPE_SINT64:
+ case FFI_TYPE_POINTER:
+ context->x[0] = extend_integer_type (rvalue, t);
+ break;
+
 case FFI_TYPE_FLOAT:
 case FFI_TYPE_DOUBLE:
 case FFI_TYPE_LONGDOUBLE:
- {
- void *addr = get_basic_type_addr (cif->rtype->type, context, 0);
- copy_basic_type (addr, rvalue, cif->rtype->type);
- break;
- }
+ extend_hfa_type (&context->v[0], rvalue, 0x100 + t);
+ break;
+
 case FFI_TYPE_STRUCT:
 h = is_hfa (cif->rtype);
 if (h)
- {
- int j;
- int type = h & 0xff;
- int elems = h >> 8;
- for (j = 0; j < elems; j++)
- {
- void *reg = get_basic_type_addr (type, context, j);
- copy_basic_type (reg, rvalue, type);
- rvalue += get_basic_type_size (type);
- }
- }
- else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
- {
- size_t size = ALIGN (cif->rtype->size, sizeof (UINT64)) ;
- memcpy (get_x_addr (context, 0), rvalue, size);
- }
+ extend_hfa_type (&context->v[0], rvalue, h);
 else
- {
- FFI_ASSERT (0);
+ {
+ FFI_ASSERT (s <= 16);
+ memcpy (&context->x[0], rvalue, s);
 }
 break;
+
 default:
- FFI_ASSERT (0);
- break;
+ abort();
 }
 }
 else
--
2.7.4.huawei.3