Remove a large swath of unmaintained emoji regex generation code

2025-05-01 20:07:19 +09:00 · 2019-04-03 17:53:39 -07:00 · 2019-04-03 17:53:39 -07:00 · 6f4fd350d7
commit 6f4fd350d7
parent 0e43130c2d
1 changed files with 6 additions and 445 deletions
--- a/2/scripts/generate
+++ b/2/scripts/generate
@ -15,450 +15,7 @@ function file(which) {
  return path.join(__dirname, '../..', which);
 }
-// Twitter assets by property name
+function createTwemoji() {
 // Asset names found on disk, keyed by the asset folder that is passed to file().
 // grabAllAssets fills each array with the upper-cased file names of that folder,
 // extension removed.
 var assets = {
  '2/72x72': [],
  '2/svg': []
 };
 // The five skin tone modifiers U+1F3FB..U+1F3FF written as \u-escaped surrogate pairs.
 // generateRegExp turns them into the optional suffix that follows diversity emoji.
 var skinToneOptions = [
  '\\ud83c\\udffb',
  '\\ud83c\\udffc',
  '\\ud83c\\udffd',
  '\\ud83c\\udffe',
  '\\ud83c\\udfff'
 ];
 // white spaces we don't want to catch via the RegExp
 // there is no asset equivalent for these
 // (hex code points; fetchEmojiSources skips them when looking for missing assets
 // and uses them as the first entries of q.ignore)
 var ignoreMissing = ['2002', '2003', '2005'];
 // Items is an array of unicode sequences with \u escaping, like ["\u2963\ufe0f", "\u263a\ufe0f"]
 // items get sorted by length (long to short), then unicode hex values (low to high)
 // output is "or" ed together using | for regex
 // output also combines adjacent items using character classes with ranges when they have common prefixes
 // Example: "aab", "aac", "aad", "aag", "ba" becomes "aa[b-dg]|ba"
 // Turns a list of \u-escaped UTF-16 sequences into one regex alternation.
 // Sequences that share everything but their last unit are merged into a
 // single character class, and three or more consecutive last units become a range.
 function generateRegexPartial(items) {
  var alternatives = [];   // finished "prefix + class" fragments
  var activePrefix = null; // prefix shared by the class being built
  var classParts = [];     // members of the class being built
  var pendingRun = [];     // consecutive last units not yet added to classParts

  // Orders two arrays of hex UCS-2 units: longer first, then low to high unit by unit.
  function compareUnits(a, b) {
    if (!a.length) {
      return 0;
    }
    var lengthDelta = b.length - a.length;
    if (lengthDelta) {
      return lengthDelta;
    }
    for (var k = 0; k < a.length; k++) {
      var unitDelta = parseInt(a[k], 16) - parseInt(b[k], 16);
      if (unitDelta) {
        return unitDelta;
      }
    }
    return 0;
  }

  // Moves the pending run into the class, collapsed to "first-last" once it has 3+ members.
  function closeRun() {
    if (pendingRun.length < 3) {
      classParts = classParts.concat(pendingRun);
    } else {
      classParts = classParts.concat([pendingRun[0], '-', pendingRun[pendingRun.length - 1]]);
    }
    pendingRun = [];
  }

  // Emits the class built so far (brackets only when it has more than one part) and resets.
  function closeClass() {
    closeRun();
    if (classParts.length) {
      var body = classParts.length === 1 ? classParts[0] : '[' + classParts.join('') + ']';
      alternatives.push(activePrefix + body);
    }
    classParts = [];
    activePrefix = null;
  }

  // Convert each item from "\u2963\ufe0f" into ["2963", "fe0f"], then sort
  var unitLists = items.map(function (escaped) {
    return escaped.split('\\u').slice(1);
  });
  unitLists.sort(compareUnits);

  for (var i = 0; i < unitLists.length; i++) {
    var units = unitLists[i];
    var head = units.slice(0, -1).join('\\u');
    var prefix = head ? '\\u' + head : head;
    // last unit as a string ('' when the item had no \u escape at all)
    var lastUnit = units.slice(-1).join('');
    if (prefix !== activePrefix) {
      closeClass();
    }
    activePrefix = prefix;
    // escaped form of the unit right before this one: the run continues only if it was the last one seen
    var predecessor = Utils.UTF162JSON(String.fromCharCode(parseInt(lastUnit, 16) - 1));
    if (pendingRun.length && pendingRun[pendingRun.length - 1] !== predecessor) {
      closeRun();
    }
    pendingRun.push('\\u' + lastUnit);
  }
  closeClass();
  return alternatives.join('|');
 }
 // basic utility to organize async code
 // see: http://webreflection.blogspot.co.uk/2012/03/tweet-sized-queue-system.html
 // or:  http://webreflection.blogspot.co.uk/2012/06/working-with-queues.html
 // Runs the functions stored in `args` one after the other.
 // The first one is started on the next tick; each task starts its successor
 // by calling args.next(). next() returns false once the list is exhausted,
 // true otherwise. The same `args` array is handed to every task and returned.
 function Queue(args, f) {
  function next() {
    f = args.shift();
    if (!f) {
      return false;
    }
    f(args);
    return true;
  }
  args.next = next;
  setTimeout(next, 0);
  return args;
 }
 // main task
 Queue([
  // will populate assets arrays
  function grabAllAssets(q) {
    console.log('analyzing all assets ... ');
    // per each path/folder
    Object.keys(assets).forEach(function (path, i, paths) {
      // grab all files in that folder
      fs.readdir(file(path), function (err, files) {
        // and add them to the assets path
        assets[path].push.apply(
          assets[path],
          files.map(upperCaseWithoutExtension)
        );
        // once all assets arrays have been populated
        if (paths.reduce(completed, true)) {
          console.log('[INFO] assets contains ' + assets[path].length + ' emoji.');
          q.next();
        }
      });
    });
    // drop extension + uppercase
    function upperCaseWithoutExtension(file) {
      return file.slice(0, file.lastIndexOf('.')).toUpperCase();
    }
    // returns true if all assets have been populated
    function completed(p, c) {
      return p && assets[c].length;
    }
  },
  // will fetch and store all emoji from unicode.org
  function fetchEmojiSources(q) {
    console.log('fetching EmojiSources.txt ... ');
    // grab all emoji and test them against them
    http.get("http://www.unicode.org/Public/UNIDATA/EmojiSources.txt", function (res) {
      var chunks = [];
      // if all good ...
      if (res.statusCode === 200) {
        // grab all data
        res.on('data', chunks.push.bind(chunks));
        // once done ...
        res.on('end', function () {
          console.log('analyzing EmojiSources VS our assets ... ');
          // store all missing assets in one object
          var missing = {};
          // will be used to store an array with all missing
          var missingGrouped = {};
          // will be needed later on
          // parse it, clean it, and store it once
          q.emojiSource = chunks
            .join('')
            .split(/\r\n|\r|\n/)
            // filter once
            .filter(function (line) {
              return this.test(line);
            }, /^[0-9A-F]/)
            // take only emoji info
            .map(function (codePoint) {
              return codePoint
                .slice(0, codePoint.indexOf(';'))
                .toUpperCase()
                // drop spaces
                .replace(/\s+/g, '-')
                // drop 0 padded prefixes
                .replace(/^0+/g, '');
            });
          console.log('[INFO] parsed ' + q.emojiSource.length + ' standard emoji.');
          // find out which one is missing from our assets
          q.emojiSource.forEach(
            function (emoji) {
              // do not loop for emoji we know we should ignore
              if (ignoreMissing.indexOf(emoji) < 0) {
                // verify all others per each folder
                this.forEach(function (path) {
                  if (assets[path].indexOf(emoji) < 0) {
                    (missing[path] || (missing[path] = [])).push(emoji);
                    missingGrouped[emoji] = true;
                  }
                });
              }
            },
            // and per each folder
            Object.keys(assets)
          );
          // if some missing emoji has been found
          if (Object.keys(missing).length) {
            // warn and show which one is missing
            console.warn('[WARNING] missing assets for:');
            console.log(missing);
          }
          // create the array of all emoji we should ignore
          q.ignore = ignoreMissing.concat(Object.keys(missingGrouped));
          q.next();
        });
      } else {
        console.error('[ERROR] unable to fetch emoji at unicode.org');
        process.exit(1);
      }
    });
  },
  // grab the list of emoji that behave differently when
  // variants such as \uFE0E and \uFE0F are in place
  function grabStandardVariants(q) {
    console.log('fetching StandardizedVariants.txt ... ');
    http.get(
      "http://unicode.org/Public/UNIDATA/StandardizedVariants.txt",
      function(res) {
        var chunks = [];
        if (res.statusCode == 200) {
          res.on('data', chunks.push.bind(chunks));
          res.on('end', function () {
            // cleaning up parsing sensitive emoji
            q.variantsSensitive = chunks
              .join('')                         // all content
              .split(/\r\n|\r|\n/)              // split in lines
              .filter(function (line) {         // containing FE0E; info
                return this.test(line);         // avoiding duplicated with FE0F
              }, / FE0E; text style/)
              .map(function (line) {            // cleaned up to grab
                return line.replace(this, '$1') // only first unicode
                        .toUpperCase();         // normalized as uppercase
              }, /^([0-9A-F]{4,}) FE0E;.+$/)    // sensitive char
            ;
            // iOS keyboard allows U+002A U+FE0F U+20E3 even though not a standardized variant (yet?)
            q.variantsSensitive.push('002A');
            // iOS keyboard allows U+2639 U+FE0F even though not a standardized variant (yet?)
            q.variantsSensitive.push('2639');
            console.log('[INFO] parsed ' + q.variantsSensitive.length + ' variant sensitive emoji.');
            q.next();
          });
        } else {
          console.error('[ERROR] unable to fetch standard variants at unicode.org');
          process.exit(1);
        }
      }
    );
  },
  // add our own assets that are not part of the Unicode standard
  function addMissingEmoji(q) {
    q.nonStandard = [];
    Object.keys(assets).forEach(function (path, i) {
      assets[path].forEach(function (emoji) {
        if (
          q.emojiSource.indexOf(emoji) < 0 &&
          q.nonStandard.indexOf(emoji) < 0
        ) {
          q.nonStandard.push(emoji);
        }
      });
    });
    if (q.nonStandard.length) {
      console.warn('[WARNING] assets contain ' + q.nonStandard.length + ' non standard emoji:');
      // console.log(q.nonStandard.join(', '));
    }
    q.emojiSource = q.emojiSource.concat(q.nonStandard)
    q.next();
  },
  // detect complete sets of five skin tones and a base
  function detectDiversityEmoji(q) {
    var isPresent = {};
    q.emojiSource.forEach(function (codePoints) {
      isPresent[codePoints] = true;
    });
    q.diversityBase = q.emojiSource.filter(function (codePoints) {
      // Start with the set of Emoji with the light skin tone
      return /-1F3FB$/.test(codePoints);
    }).map(function (codePoints) {
      // Take the skin tone off
      return codePoints.replace(/-1F3FB$/, '');
    }).filter(function (baseCodePoints) {
      // Verify that all other skin tones + no skin tone are present
      return ['-1F3FC', '-1F3FD', '-1F3FE', '-1F3FF', ''].every(function (suffix) {
        return isPresent[baseCodePoints + suffix];
      });
    });
    console.log('[INFO] parsed ' + q.diversityBase.length + ' diversity emoji.');
    q.next();
  },
  // sort every emoji into the zwj, keycap, diversity, regular or variant sensitive bucket
  function partitionEmojiTypes(q) {
    console.log('partitioning emoji into types');
    q.zwj = [];
    q.diversity = [];
    q.sensitive = [];
    q.sensitiveKeycaps = [];
    q.diversitySensitive = [];
    q.regular = [];
    q.emojiSource.forEach(function (codePoints) {
      var u;
      var codePointsWithoutKeycap;
      codePoints = codePoints.replace(/\b[A-F0-9]+\b/g, function (hex) {
        // Pad all hex numbers to have at least 4 digits to match variantsSensitive
        return hex.length < 4 ? ('000' + hex).slice(-4) : hex;
      });
      if (q.ignore.indexOf(codePoints) < 0) {
        u = Utils.toJSON(codePoints);
        codePointsWithoutKeycap = codePoints.replace(/-20E3$/, '');
        if (codePoints.indexOf('200D') >= 0) {
          q.zwj.push(u);
        } else if (codePoints != codePointsWithoutKeycap && q.variantsSensitive.indexOf(codePointsWithoutKeycap) >= 0) {
          q.sensitiveKeycaps.push(Utils.toJSON(codePointsWithoutKeycap));
        } else if (q.diversityBase.indexOf(codePoints.replace(/-1F3F[B-F]$/, '')) >= 0) {
          // This is a diversity Emoji with or without a skin tone modifier
          // Add it to the regex if this is the base without the modifier
          if (q.diversityBase.indexOf(codePoints) >= 0) {
            if (q.variantsSensitive.indexOf(codePoints) < 0) {
              q.diversity.push(u);
            } else {
              q.diversitySensitive.push(u);
            }
          }
        } else if (q.variantsSensitive.indexOf(codePoints) < 0) {
          q.regular.push(u);
        } else {
          q.sensitive.push(u);
        }
      }
    });
    q.next();
  },
  // Queue task: looks for complete families of ZWJ sequences in q.zwj, removes
  // their members from q.zwj and stores one factored regex source per pattern
  // in q.zwjCommonPatterns.
  function factorZwjSequences(q) {
    q.zwjCommonPatterns = [];
    // There are dozens of new ZWJ sequences that have common prefixes or suffixes with
    // skin tone + gender variations. To keep the main regex from growing excessively large and
    // slow, choose some common sub-expressions to factor.
    // Each `re` has exactly one capture group, the part that differs between families.
    // The fixed part of every pattern allows 12 spellings, hence numCombinations:
    // 2 genders x (5 skin tones + none), or 2 genders x (5 skin tones + \ufe0f).
    var commonPatterns = [
      {
        name: 'leading man/woman zwj with optional skin tone',
        re: '\\ud83d[\\udc68-\\udc69](?:\\ud83c[\\udffb-\\udfff])?\\u200d(.+?)',
        numCombinations: 12
      }, {
        name: 'variant or skin tone before trailing female/male zwj',
        re: '(.+?)(?:\\ufe0f|\\ud83c[\\udffb-\\udfff])\\u200d[\\u2640\\u2642]\\ufe0f',
        numCombinations: 12
      }, {
        name: 'optional skin tone before trailing female/male zwj',
        re: '(.+?)(?:\\ud83c[\\udffb-\\udfff])?\\u200d[\\u2640\\u2642]\\ufe0f',
        numCombinations: 12
      }
    ];
    commonPatterns.forEach(function(pattern) {
      // captured part -> all the sequences of q.zwj (as real characters) that produced it
      var mapOfMatches = {};
      // anchored, so only whole sequences match
      var re = new RegExp('^' + pattern.re + '$');
      q.zwj.forEach(function(jsonString) {
        // q.zwj holds \u escaped text: decode it to real characters before matching
        var rawString = JSON.parse('"' + jsonString + '"');
        var match = rawString.match(re);
        if (match) {
          var key = match[1];
          mapOfMatches[key] = mapOfMatches[key] || [];
          mapOfMatches[key].push(match[0]);
        }
      });
      // captured parts (escaped again) whose family is complete
      var replacements = [];
      Object.keys(mapOfMatches).forEach(function(key) {
        var matches = mapOfMatches[key];
        // Only a complete set may be replaced
        if (matches.length === pattern.numCombinations) {
          replacements.push(Utils.UTF162JSON(key));
          // Remove all items in the match set from the original zwj list
          // (q.zwj is not being iterated at this point, so splicing it is safe)
          matches.forEach(function(rawString) {
            var indexToRemove = q.zwj.indexOf(Utils.UTF162JSON(rawString));
            if (indexToRemove >= 0) {
              q.zwj.splice(indexToRemove, 1);
            }
          });
        }
      });
      if (replacements.length) {
        // Replace the wildcard section of the regex with a regex group of replacements
        // ('(.+?' is swapped for '(?:' + alternatives, the closing ')' of the pattern is kept;
        // this `var re` is the same function scoped variable as the RegExp above, reused)
        var re = pattern.re.replace('(.+?', '(?:' + generateRegexPartial(replacements));
        q.zwjCommonPatterns.push(re);
        console.log('Refactoring ' + replacements.length + ' complete sets of ' + pattern.numCombinations + ' zwj from ' + pattern.name);
      } else {
        console.log('did not find any complete sets of ' + pattern.name);
      }
    });
    q.next();
  },
  // with all info, generate a RegExp that will catch
  // only standard emoji that are present in our assets
  function generateRegExp(q) {
    console.log('generating a RegExp for available assets');
    q.re = '';
    // The Zero-width joiner common patterns, if present, need to come first
    if (q.zwjCommonPatterns.length) {
      q.re += q.zwjCommonPatterns.join('|') + '|';
    }
    // Then the rest of the zwjs
    if (q.zwj.length) {
      q.re += generateRegexPartial(q.zwj) + '|';
    }
    // Group the variant sensitive keycaps
    if (q.sensitiveKeycaps.length) {
      q.re += '(?:' + generateRegexPartial(q.sensitiveKeycaps) + ')\\ufe0f?\\u20e3|';
    }
    // Next, add the diversity enabled Emoji that may include a skin tone suffix
    if (q.diversity.length + q.diversitySensitive.length) {
      q.re += '(?:';
      if (q.diversitySensitive.length) {
        // Some diversity are sensitive to variants
        q.re += '(?:' + generateRegexPartial(q.diversitySensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
        if (q.diversity.length) {
          q.re += '|';
        }
      }
      q.re += generateRegexPartial(q.diversity) + ')(?:' + generateRegexPartial(skinToneOptions) + '|)|';
    }
    // Next, the normal Emoji
    q.re += generateRegexPartial(q.regular) + '|';
    // Finally, add the rest of the sensitive ones that may be followed by U+FE0F but not U+FE0E
    q.re += '(?:' + generateRegexPartial(q.sensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
    q.next();
  },
  // Last queue task: hands the generated RegExp source to createTwemoji and
  // then loads ./create-dist. It does not call q.next().
  function generateFile(q) {
    var emojiPattern = q.re;
    console.log('generating ./twemoji.js');
    createTwemoji(emojiPattern);
    require('./create-dist');
  }
 ]);
 function createTwemoji(re) {
  fs.writeFileSync(
    file('2/twemoji.js'),
    '/*jslint indent: 2, browser: true, bitwise: true, plusplus: true */\n' +
@ -1042,4 +599,8 @@ function createTwemoji(re) {
        ) +
        '\n  */'
      ) + '());');
 }
 // Entry point of the script: run createTwemoji without arguments, then load
 // ./create-dist so that module executes right after it.
 // NOTE(review): create-dist presumably builds the distribution files from the
 // freshly written twemoji.js; confirm against that script.
 createTwemoji();
 require('./create-dist');