gcc 11 weird bug

Wed Oct 27 09:19:00 GMT 2021

I noticed that mintty did not compile anymore after upgrade to gcc 11, 
but only on cygwin 32-bit.
I tried to minimize the test case as much as possible without having the 
bug vanish, to the attached standalone file.
Compile this with
cc -O2 -Wall -Werror m0.c
and it gives a false positive warning about possible uninitialized data 
usage.
While data flow analysis is not perfect, it is weird that this used to 
happen on 32 bit but not on 64 bit.
Meanwhile, after updating some other packages (not sure which), but 
still the same gcc version, the report on the test case also happens on 
64 bit, while the original, unstripped file, as part of mintty, still 
works without error on 64 bit, which is even weirder.
I have not yet had the opportunity to test this on Linux, sorry, so I'm 
reporting it here.
Thomas
-------------- next part --------------
#include <stdbool.h>
typedef unsigned short wchar;    // UTF-16
typedef unsigned char uchar;
#define lengthof(array) (sizeof(array) / sizeof(*(array)))

#define when break; case
#define or : case
#define otherwise break; default

typedef unsigned int ucschar;

typedef struct {
} bidi_char;

/* bidi classes (Unicode: PropertyValueAliases.txt) */
enum {
  L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET, AN, CS, NSM, BN, B, S, WS, ON,
  LRI, RLI, FSI, PDI
};

#define leastGreaterOdd(x) ( ((x)+1) | 1 )
#define leastGreaterEven(x) ( ((x)+2) &~ 1 )

static bool
is_NI(uchar bc)
{
  return 1 & (1 << (bc));
}

/*
 * The Main Bidi Function, and the only function that should
 * be used by the outside world.
 *
 * line: a buffer of size count containing text to apply
 * the Bidirectional algorithm to.
 */
int
do_bidi(bool autodir, int paragraphLevel, bool explicitRTL, bool box_mirror, 
        bidi_char * line, int count)
{
  uchar currentEmbedding;
  uchar currentOverride;
  uchar tempType;
  int i, j;

  uchar bidi_class_of(int i) {
    (void)i;

    if (explicitRTL)
      return R;

    return ON;
  }

 /* Rule (P2), (P3)
  * P2. In each paragraph, find the first character of type L, AL, or R 
    while skipping over any characters between an isolate initiator and 
    its matching PDI or, if it has no matching PDI, the end of the paragraph.
  * P3. If a character is found in P2 and it is of type AL or R, then set
  * the paragraph embedding level to one; otherwise, set it to zero.
  */
  int isolateLevel = 0;
  int resLevel = -1;

 /* Initialize types, levels */
  uchar types[count];
  uchar levels[count];
  (void)levels;

 /* Rule (X1)
    X1. At the beginning of a paragraph, perform the following steps:
  • Set the stack to empty.
  • Push onto the stack an entry consisting of the paragraph embedding level,
    a neutral directional override status, and a false directional isolate status.
  • Set the overflow isolate count to zero.
  • Set the overflow embedding count to zero.
  • Set the valid isolate count to zero.
  • Process each character iteratively, applying rules X2 through X8. 
    Only embedding levels from 0 through max_depth are valid in this phase. 
    (Note that in the resolution of levels in rules I1 and I2, 
    the maximum embedding level of max_depth+1 can be reached.)
  */
  currentEmbedding = paragraphLevel;
  currentOverride = ON;
  bool currentIsolate = false;

  // By making the dss as large as the whole line, we avoid overflow handling.
  uchar dss_emb[count + 1];
  uchar dss_ovr[count + 1];
  bool dss_isol[count + 1];
  int dss_top = -1;

  int countdss() { return dss_top + 1; }

  void pushdss() {
    dss_top++;
    dss_emb[dss_top] = currentEmbedding;
    dss_ovr[dss_top] = currentOverride;
    dss_isol[dss_top] = currentIsolate;
  }

  void popdss() {
    // remove top
    if (dss_top >= 0)
      dss_top--;
    // then set current values to new top
    if (dss_top >= 0) {
      currentEmbedding = dss_emb[dss_top];
      currentOverride = dss_ovr[dss_top];
      currentIsolate = dss_isol[dss_top];
    }
  }

  pushdss();
  //int ovfIsolate = 0;
  //int ovfEmbedding = 0;
  isolateLevel = 0;

 /* Rule (X2), (X3), (X4), (X5), (X6), (X7), (X8)
  * X2. With each RLE, compute the least greater odd embedding level.
  * X3. With each LRE, compute the least greater even embedding level.
  * X4. With each RLO, compute the least greater odd embedding level.
  * X5. With each LRO, compute the least greater even embedding level.
  * X6. For all types besides RLE, LRE, RLO, LRO, and PDF:
  *          a. Set the level of the current character to the current
  *              embedding level.
  *          b. Whenever the directional override status is not neutral,
  *              reset the current character type to the directional
  *              override status.
  * X7. With each PDF, determine the matching embedding or override code.
  * If there was a valid matching code, restore (pop) the last
  * remembered (pushed) embedding level and directional override.
  * X8. All explicit directional embeddings and overrides are completely
  * terminated at the end of each paragraph. Paragraph separators are not
  * included in the embedding. (Useless here) NOT IMPLEMENTED
  */
  for (i = 0; i < count; i++) {
    tempType = bidi_class_of(i);
    levels[i] = currentEmbedding;

    if (tempType == FSI) {
      int lvl = 0;
      tempType = LRI;
      for (int k = i + 1; k < count; k++) {
        uchar kType = bidi_class_of(k);
        if (kType == FSI || kType == RLI || kType == LRI)
          lvl++;
        else if (kType == PDI) {
          if (lvl)
            lvl--;
          else
            break;
        }
        else if (kType == R || kType == AL) {
          tempType = RLI;
          break;
        }
        else if (kType == L)
          break;
      }
    }
    switch (tempType) {
      when RLE:
        currentEmbedding = leastGreaterOdd(currentEmbedding);
        currentOverride = ON;
        currentIsolate = false;
        pushdss();
      when LRE:
        currentEmbedding = leastGreaterEven(currentEmbedding);
        currentOverride = ON;
        currentIsolate = false;
        pushdss();
      when RLO:
        currentEmbedding = leastGreaterOdd(currentEmbedding);
        currentOverride = R;
        currentIsolate = false;
        pushdss();
      when LRO:
        currentEmbedding = leastGreaterEven(currentEmbedding);
        currentOverride = L;
        currentIsolate = false;
        pushdss();
      when RLI:
        if (currentOverride != ON)
          tempType = currentOverride;
        currentEmbedding = leastGreaterOdd(currentEmbedding);
        isolateLevel++;
        currentOverride = ON;
        currentIsolate = true;
        pushdss();
      when LRI:
        if (currentOverride != ON)
          tempType = currentOverride;
        currentEmbedding = leastGreaterEven(currentEmbedding);
        isolateLevel++;
        currentOverride = ON;
        currentIsolate = true;
        pushdss();
      when PDF:
        if (!currentIsolate && countdss() >= 2)
          popdss();
        levels[i] = currentEmbedding;
      when PDI:
        if (isolateLevel) {
          while (!currentIsolate && countdss() > 0)
            popdss();
          popdss();
          isolateLevel--;
        }
        if (currentOverride != ON)
          tempType = currentOverride;
        levels[i] = currentEmbedding;
      when WS or S: /* Whitespace is treated as neutral for now */
        if (currentOverride != ON)
          tempType = currentOverride;
      otherwise:
        if (currentOverride != ON)
          tempType = currentOverride;
    }
    types[i] = tempType;
  }

 /* Rule (X9)
  * X9. Remove all RLE, LRE, RLO, LRO, PDF, and BN codes.
  * Here, they're converted to NSM (used to be BN).
  */
  bool skip[count];
  for (i = 0; i < count; i++) {
    switch (types[i]) {
      when RLE or LRE or RLO or LRO or PDF or BN:
        //types[i] = BN;
        types[i] = NSM;  // fixes 4594 test cases + 28 char test cases
        skip[i] = true;  // remove char from algorithm... (usage incomplete)
      otherwise:
        skip[i] = false;
    }
  }

 /* Rule (W1)
  * W1. Examine each non-spacing mark (NSM) in the level run, and change
  * the type of the NSM to the type of the previous character. If the NSM
  * is at the start of the level run, it will get the type of sor.
  // TODO: check
    W1. Examine each nonspacing mark (NSM) in the isolating run sequence, 
    and change the type of the NSM 
    to Other Neutral if the previous character is an isolate initiator or PDI, 
    and to the type of the previous character otherwise. 
    If the NSM is at the start of the isolating run sequence, 
    it will get the type of sos. 
    (Note that in an isolating run sequence, an isolate initiator followed by 
    an NSM or any type other than PDI must be an overflow isolate initiator.)
  */
  if (types[0] == NSM /*&& !skip[0]*/)
    types[0] = (paragraphLevel & 1) ? R : L;  // sor

  for (i = 1; i < count; i++) {
    if (types[i] == NSM /*&& !skip[i]*/)
      switch (types[i - 1]) {
        when LRI or RLI or FSI or PDI:
          types[i] = ON;
        otherwise:
          types[i] = types[i - 1];
      }
  }

 /* Rule (W4)
  * W4. A single European separator between two European numbers changes
  * to a European number. A single common separator between two numbers
  * of the same type changes to that type.
  */
  for (i = 1; i < count - 1; i++) {
    if (types[i] == ES) {
      if (types[i - 1] == EN && types[i + 1] == EN)
        types[i] = EN;
    }
    else if (types[i] == CS) {
      if (types[i - 1] == EN && types[i + 1] == EN)
        types[i] = EN;
      else if (types[i - 1] == AN && types[i + 1] == AN)
        types[i] = AN;
    }
  }

 /* Rule (W5)
  * W5. A sequence of European terminators adjacent to European numbers
  * changes to all European numbers.
  *
  * Optimization: lots here... else ifs need rearrangement
  */
  for (i = 0; i < count; i++) {
    if (types[i] == ET) {
      if (i > 0 && types[i - 1] == EN) {
        types[i] = EN;
        continue;
      }
      else if (i < count - 1 && types[i + 1] == EN) {
        types[i] = EN;
        continue;
      }
      else if (i < count - 1 && types[i + 1] == ET) {
        j = i;
        while (j < count && types[j] == ET) {
          j++;
        }
        if (types[j] == EN)
          types[i] = EN;
      }
    }
  }

 /* Rule (N1)
  * N1. A sequence of NIs takes the direction of the surrounding
  * strong text if the text on both sides has the same direction.
  * European and Arabic numbers are treated as though they were R.
  // TODO: check
    European and Arabic numbers act as if they were R in terms of their 
    influence on NIs. The start-of-sequence (sos) and end-of-sequence (eos) 
    types are used at isolating run sequence boundaries.
  */
  if (count >= 2 && is_NI(types[0])) {
    if ((paragraphLevel & 1) &&
        ((types[1] == R) || (types[1] == EN) || (types[1] == AN))
       )
      types[0] = R;
    else if (!(paragraphLevel & 1) && types[1] == L)
      types[0] = L;
  }
  for (i = 1; i < count - 1; i++) {
    if (is_NI(types[i])) {
      if (types[i - 1] == L) {
        j = i;
        while (j < count - 1 && is_NI(types[j])) {
          j++;
        }
        if (types[j] == L) {
          while (i < j) {
            types[i] = L;
            i++;
          }
        }

      }
      else if ((types[i - 1] == R) || (types[i - 1] == EN) ||
               (types[i - 1] == AN)) {
        j = i;
        while (j < count - 1 && is_NI(types[j])) {
          j++;
        }
        if ((types[j] == R) || (types[j] == EN) || (types[j] == AN)) {
          while (i < j) {
            types[i] = R;
            i++;
          }
        }
      }
    }
  }
  if (count >= 2 && is_NI(types[count - 1])) {
    if ((paragraphLevel & 1) &&
        (types[count - 2] == R || types[count - 2] == EN || types[count - 2] == AN)
       )
      types[count - 1] = R;
    else if (!(paragraphLevel & 1) && types[count - 2] == L)
      types[count - 1] = L;
  }

 /* Rule (L1)
  * L1. On each line, reset the embedding level of the following characters
  * to the paragraph embedding level:
  *   (1) segment separators,
  *   (2) paragraph separators,
  *   (3) any sequence of whitespace characters or isolate markers
  *       preceding a segment separator or paragraph separator,
  *   (4) and any sequence of whitespace characters or isolate markers
  *       at the end of the line.
  * The types of characters used here are the original types, not those
  * modified by the previous phase.
    N/A: Because a paragraph separator breaks lines, there will be at most 
    one per line, at the end of that line.
  */
  // (4)
  j = count - 1;
  while (j > 0 && (bidi_class_of(j) == WS || skip[j])) {
    j--;
  }
  if (j < count - 1) {
    for (j++; j < count; j++)
      levels[j] = paragraphLevel;
  }
  // (3)
  for (i = 0; i < count; i++) {
    tempType = bidi_class_of(i);
    if (tempType == WS) {
      j = i;
      while (j < count && (bidi_class_of(j) == WS || skip[j])) {
        j++;
      }
      if (j == count || bidi_class_of(j) == B || bidi_class_of(j) == S) {
        for (j--; j >= i; j--) {
          levels[j] = paragraphLevel;
        }
      }
    }
    else if (tempType == B || tempType == S) {
      levels[i] = paragraphLevel;
    }
  }

  return resLevel;
}