mirror of
https://github.com/twitter/twemoji.git
synced 2025-05-01 20:07:19 +09:00
Remove a large swath of unmaintained emoji regex generation code
This commit is contained in:
parent
0e43130c2d
commit
6f4fd350d7
@ -15,450 +15,7 @@ function file(which) {
|
|||||||
return path.join(__dirname, '../..', which);
|
return path.join(__dirname, '../..', which);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Twitter assets by property name
|
function createTwemoji() {
|
||||||
// Twitter asset folders, keyed by their path relative to the repository root;
// each array is filled later with the names of the files found in that folder
var assets = {
  '2/72x72': [],
  '2/svg': []
};

// the five skin tone modifiers, U+1F3FB to U+1F3FF, as escaped surrogate pairs
var skinToneOptions = ['b', 'c', 'd', 'e', 'f'].map(function (lastDigit) {
  return '\\ud83c\\udff' + lastDigit;
});

// white spaces we don't want to catch via the RegExp:
// there is no asset equivalent for these
var ignoreMissing = '2002 2003 2005'.split(' ');
|
|
||||||
|
|
||||||
// Builds a compact regex alternation out of a list of escaped UTF-16 sequences.
//
// Items is an array of unicode sequences with \u escaping, like ["\u2963\ufe0f", "\u263a\ufe0f"]
// items get sorted by length (long to short), then unicode hex values (low to high)
// output is "or" ed together using | for regex
// output also combines adjacent items using character classes with ranges when they have common prefixes
// Example: "aab", "aac", "aad", "aag", "ba" becomes "aa[b-dg]|ba"
//
// Items that contain no \u escape at all are ignored.
// Returns the alternation as a string ('' when there is nothing to emit).
function generateRegexPartial(items) {
  var result = [];
  // escaped code units shared by every entry of the class being built
  var currentPrefix = null;
  // finished members (single units or "first", "-", "last" triples) of that class
  var charClass = [];
  // run of consecutive final code units that may still grow into a range
  var charRange = [];
  // numeric value of the last code unit pushed into charRange
  var lastCodeUnit = null;

  items.map(function (item) {
    // Convert from "\u2963\ufe0f" into ["2963", "fe0f"]
    return item.split('\\u').slice(1);
  }).filter(function (itemParts) {
    // nothing can be emitted for an item without code units
    return itemParts.length > 0;
  }).sort(sortMethod).forEach(function (itemParts) {
    var prefix = itemParts.slice(0, -1).join('\\u');
    if (prefix) {
      prefix = '\\u' + prefix;
    }
    var suffix = itemParts[itemParts.length - 1];
    var codeUnit = parseInt(suffix, 16);
    if (prefix !== currentPrefix) {
      flushCharClass();
    }
    currentPrefix = prefix;
    // compare numerically so that adjacency does not depend on hex casing
    if (charRange.length && lastCodeUnit !== codeUnit - 1) {
      flushCharRange();
    }
    charRange.push('\\u' + suffix);
    lastCodeUnit = codeUnit;
  });

  flushCharClass();
  return result.join('|');

  // a and b are arrays of hex UCS-2 units:
  // longer sequences first, then ascending unit by unit
  function sortMethod(a, b) {
    if (a.length !== b.length) {
      return b.length - a.length;
    }
    for (var i = 0; i < a.length; i++) {
      var difference = parseInt(a[i], 16) - parseInt(b[i], 16);
      if (difference) {
        return difference;
      }
    }
    return 0;
  }

  // moves the pending run into the class; runs of three or more
  // consecutive units are collapsed into "first-last"
  function flushCharRange() {
    charClass = charClass.concat((charRange.length < 3) ?
      charRange :
      [ charRange[0], '-', charRange.slice(-1)[0] ]
    );
    charRange = [];
    lastCodeUnit = null;
  }

  // emits prefix + class (brackets are omitted for a single member)
  // and starts over with an empty class
  function flushCharClass() {
    flushCharRange();
    if (charClass.length) {
      result.push(currentPrefix + (charClass.length === 1 ?
        charClass[0] :
        '[' + charClass.join('') + ']'
      ));
    }
    charClass = [];
    currentPrefix = null;
  }
}
|
|
||||||
|
|
||||||
// basic utility to organize async code
// see: http://webreflection.blogspot.co.uk/2012/03/tweet-sized-queue-system.html
// or: http://webreflection.blogspot.co.uk/2012/06/working-with-queues.html
//
// args is an array of step functions; it gains a next() method that removes
// the first remaining step and invokes it with the array itself.
// next() returns true when a step was invoked, false once the array is empty.
// The first next() is scheduled through setTimeout; the array is returned.
function Queue(args, f) {
  args.next = function next() {
    f = args.shift();
    if (!f) {
      return false;
    }
    f(args);
    return true;
  };
  setTimeout(args.next, 0);
  return args;
}
|
|
||||||
|
|
||||||
// main task
|
|
||||||
Queue([
|
|
||||||
|
|
||||||
// will populate assets arrays
|
|
||||||
function grabAllAssets(q) {
|
|
||||||
console.log('analyzing all assets ... ');
|
|
||||||
// per each path/folder
|
|
||||||
Object.keys(assets).forEach(function (path, i, paths) {
|
|
||||||
// grab all files in that folder
|
|
||||||
fs.readdir(file(path), function (err, files) {
|
|
||||||
// and add them to the assets path
|
|
||||||
assets[path].push.apply(
|
|
||||||
assets[path],
|
|
||||||
files.map(upperCaseWithoutExtension)
|
|
||||||
);
|
|
||||||
// once all assets arrays have been populated
|
|
||||||
if (paths.reduce(completed, true)) {
|
|
||||||
console.log('[INFO] assets contains ' + assets[path].length + ' emoji.');
|
|
||||||
q.next();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
});
|
|
||||||
// drop extension + uppercase
|
|
||||||
function upperCaseWithoutExtension(file) {
|
|
||||||
return file.slice(0, file.lastIndexOf('.')).toUpperCase();
|
|
||||||
}
|
|
||||||
// returns true if all assets have been populated
|
|
||||||
function completed(p, c) {
|
|
||||||
return p && assets[c].length;
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
// will fetch and store all emoji from unicode.org
|
|
||||||
function fetchEmojiSources(q) {
|
|
||||||
console.log('fetching EmojiSources.txt ... ');
|
|
||||||
// grab all emoji and test them against them
|
|
||||||
http.get("http://www.unicode.org/Public/UNIDATA/EmojiSources.txt", function (res) {
|
|
||||||
var chunks = [];
|
|
||||||
// if all good ...
|
|
||||||
if (res.statusCode === 200) {
|
|
||||||
// grab all data
|
|
||||||
res.on('data', chunks.push.bind(chunks));
|
|
||||||
// once done ...
|
|
||||||
res.on('end', function () {
|
|
||||||
console.log('analyzing EmojiSources VS our assets ... ');
|
|
||||||
// store all missing assets in one object
|
|
||||||
var missing = {};
|
|
||||||
// will be used to store an array with all missing
|
|
||||||
var missingGrouped = {};
|
|
||||||
|
|
||||||
// will be needed later on
|
|
||||||
// parse it, clean it, and store it once
|
|
||||||
q.emojiSource = chunks
|
|
||||||
.join('')
|
|
||||||
.split(/\r\n|\r|\n/)
|
|
||||||
// filter once
|
|
||||||
.filter(function (line) {
|
|
||||||
return this.test(line);
|
|
||||||
}, /^[0-9A-F]/)
|
|
||||||
// take only emoji info
|
|
||||||
.map(function (codePoint) {
|
|
||||||
return codePoint
|
|
||||||
.slice(0, codePoint.indexOf(';'))
|
|
||||||
.toUpperCase()
|
|
||||||
// drop spaces
|
|
||||||
.replace(/\s+/g, '-')
|
|
||||||
// drop 0 padded prefixes
|
|
||||||
.replace(/^0+/g, '');
|
|
||||||
});
|
|
||||||
|
|
||||||
console.log('[INFO] parsed ' + q.emojiSource.length + ' standard emoji.');
|
|
||||||
|
|
||||||
// find out which one is missing from our assets
|
|
||||||
q.emojiSource.forEach(
|
|
||||||
function (emoji) {
|
|
||||||
// do not loop for emoji we know we should ignore
|
|
||||||
if (ignoreMissing.indexOf(emoji) < 0) {
|
|
||||||
// verify all others per each folder
|
|
||||||
this.forEach(function (path) {
|
|
||||||
if (assets[path].indexOf(emoji) < 0) {
|
|
||||||
(missing[path] || (missing[path] = [])).push(emoji);
|
|
||||||
missingGrouped[emoji] = true;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
},
|
|
||||||
// and per each folder
|
|
||||||
Object.keys(assets)
|
|
||||||
);
|
|
||||||
|
|
||||||
// if some missing emoji has been found
|
|
||||||
if (Object.keys(missing).length) {
|
|
||||||
// warn and show which one is missing
|
|
||||||
console.warn('[WARNING] missing assets for:');
|
|
||||||
console.log(missing);
|
|
||||||
}
|
|
||||||
// create the array of all emoji we should ignore
|
|
||||||
q.ignore = ignoreMissing.concat(Object.keys(missingGrouped));
|
|
||||||
|
|
||||||
q.next();
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
console.error('[ERROR] unable to fetch emoji at unicode.org');
|
|
||||||
process.exit(1);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
},
|
|
||||||
|
|
||||||
// grab the list of emoji that behave differently when
|
|
||||||
// variants such \uFE0E and \uFE0F are in place
|
|
||||||
function grabStandardVariants(q) {
|
|
||||||
console.log('fetching StandardizedVariants.txt ... ');
|
|
||||||
http.get(
|
|
||||||
"http://unicode.org/Public/UNIDATA/StandardizedVariants.txt",
|
|
||||||
function(res) {
|
|
||||||
var chunks = [];
|
|
||||||
if (res.statusCode == 200) {
|
|
||||||
res.on('data', chunks.push.bind(chunks));
|
|
||||||
res.on('end', function () {
|
|
||||||
// cleaning up parsing sensitive emoji
|
|
||||||
q.variantsSensitive = chunks
|
|
||||||
.join('') // all content
|
|
||||||
.split(/\r\n|\r|\n/) // split in lines
|
|
||||||
.filter(function (line) { // containing FE0E; info
|
|
||||||
return this.test(line); // avoiding duplicated with FE0F
|
|
||||||
}, / FE0E; text style/)
|
|
||||||
.map(function (line) { // cleaned up to grab
|
|
||||||
return line.replace(this, '$1') // only first unicode
|
|
||||||
.toUpperCase(); // normalized as uppercase
|
|
||||||
}, /^([0-9A-F]{4,}) FE0E;.+$/) // sensitive char
|
|
||||||
;
|
|
||||||
|
|
||||||
// iOS keyboard allows U+002A U+FE0F U+20E3 even though not a standardized variant (yet?)
|
|
||||||
q.variantsSensitive.push('002A');
|
|
||||||
// iOS keyboard allows U+2639 U+FE0F even though not a standardized variant (yet?)
|
|
||||||
q.variantsSensitive.push('2639');
|
|
||||||
|
|
||||||
console.log('[INFO] parsed ' + q.variantsSensitive.length + ' variant sensitive emoji.');
|
|
||||||
q.next();
|
|
||||||
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
console.error('[ERROR] unable to fetch standard variants at unicode.org');
|
|
||||||
process.exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
);
|
|
||||||
},
|
|
||||||
|
|
||||||
// add our own assets that are not part of the Unicode standard
|
|
||||||
function addMissingEmoji(q) {
|
|
||||||
q.nonStandard = [];
|
|
||||||
Object.keys(assets).forEach(function (path, i) {
|
|
||||||
assets[path].forEach(function (emoji) {
|
|
||||||
if (
|
|
||||||
q.emojiSource.indexOf(emoji) < 0 &&
|
|
||||||
q.nonStandard.indexOf(emoji) < 0
|
|
||||||
) {
|
|
||||||
q.nonStandard.push(emoji);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
if (q.nonStandard.length) {
|
|
||||||
console.warn('[WARNING] assets contain ' + q.nonStandard.length + ' non standard emoji:');
|
|
||||||
// console.log(q.nonStandard.join(', '));
|
|
||||||
}
|
|
||||||
|
|
||||||
q.emojiSource = q.emojiSource.concat(q.nonStandard)
|
|
||||||
q.next();
|
|
||||||
},
|
|
||||||
|
|
||||||
// detect complete sets of five skin tones and a base
|
|
||||||
function detectDiversityEmoji(q) {
|
|
||||||
var isPresent = {};
|
|
||||||
q.emojiSource.forEach(function (codePoints) {
|
|
||||||
isPresent[codePoints] = true;
|
|
||||||
});
|
|
||||||
q.diversityBase = q.emojiSource.filter(function (codePoints) {
|
|
||||||
// Start with the set of Emoji with the light skin tone
|
|
||||||
return /-1F3FB$/.test(codePoints);
|
|
||||||
}).map(function (codePoints) {
|
|
||||||
// Take the skin tone off
|
|
||||||
return codePoints.replace(/-1F3FB$/, '');
|
|
||||||
}).filter(function (baseCodePoints) {
|
|
||||||
// Verify that all other skin tones + no skin tone are present
|
|
||||||
return ['-1F3FC', '-1F3FD', '-1F3FE', '-1F3FF', ''].every(function (suffix) {
|
|
||||||
return isPresent[baseCodePoints + suffix];
|
|
||||||
});
|
|
||||||
});
|
|
||||||
console.log('[INFO] parsed ' + q.diversityBase.length + ' diversity emoji.');
|
|
||||||
q.next();
|
|
||||||
},
|
|
||||||
|
|
||||||
// detect complete sets of five skin tones and a base
|
|
||||||
function partitionEmojiTypes(q) {
|
|
||||||
console.log('partitioning emoji into types');
|
|
||||||
q.zwj = [];
|
|
||||||
q.diversity = [];
|
|
||||||
q.sensitive = [];
|
|
||||||
q.sensitiveKeycaps = [];
|
|
||||||
q.diversitySensitive = [];
|
|
||||||
q.regular = [];
|
|
||||||
q.emojiSource.forEach(function (codePoints) {
|
|
||||||
var u;
|
|
||||||
var codePointsWithoutKeycap;
|
|
||||||
codePoints = codePoints.replace(/\b[A-F0-9]+\b/g, function (hex) {
|
|
||||||
// Pad all hex numbers to have at least 4 digits to match variantsSensitive
|
|
||||||
return hex.length < 4 ? ('000' + hex).slice(-4) : hex;
|
|
||||||
});
|
|
||||||
if (q.ignore.indexOf(codePoints) < 0) {
|
|
||||||
u = Utils.toJSON(codePoints);
|
|
||||||
codePointsWithoutKeycap = codePoints.replace(/-20E3$/, '');
|
|
||||||
if (codePoints.indexOf('200D') >= 0) {
|
|
||||||
q.zwj.push(u);
|
|
||||||
} else if (codePoints != codePointsWithoutKeycap && q.variantsSensitive.indexOf(codePointsWithoutKeycap) >= 0) {
|
|
||||||
q.sensitiveKeycaps.push(Utils.toJSON(codePointsWithoutKeycap));
|
|
||||||
} else if (q.diversityBase.indexOf(codePoints.replace(/-1F3F[B-F]$/, '')) >= 0) {
|
|
||||||
// This is a diversity Emoji with or without a skin tone modifier
|
|
||||||
// Add it to the regex if this is the base without the modifier
|
|
||||||
if (q.diversityBase.indexOf(codePoints) >= 0) {
|
|
||||||
if (q.variantsSensitive.indexOf(codePoints) < 0) {
|
|
||||||
q.diversity.push(u);
|
|
||||||
} else {
|
|
||||||
q.diversitySensitive.push(u);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (q.variantsSensitive.indexOf(codePoints) < 0) {
|
|
||||||
q.regular.push(u);
|
|
||||||
} else {
|
|
||||||
q.sensitive.push(u);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
q.next();
|
|
||||||
},
|
|
||||||
|
|
||||||
function factorZwjSequences(q) {
|
|
||||||
q.zwjCommonPatterns = [];
|
|
||||||
|
|
||||||
// There are dozens of new ZWJ sequences that have common prefixes or suffixes with
|
|
||||||
// skin tone + gender variations. To keep the main regex from growing excessively large and
|
|
||||||
// slow, choose some common sub-expressions to factor.
|
|
||||||
var commonPatterns = [
|
|
||||||
{
|
|
||||||
name: 'leading man/woman zwj with optional skin tone',
|
|
||||||
re: '\\ud83d[\\udc68-\\udc69](?:\\ud83c[\\udffb-\\udfff])?\\u200d(.+?)',
|
|
||||||
numCombinations: 12
|
|
||||||
}, {
|
|
||||||
name: 'variant or skin tone before trailing female/male zwj',
|
|
||||||
re: '(.+?)(?:\\ufe0f|\\ud83c[\\udffb-\\udfff])\\u200d[\\u2640\\u2642]\\ufe0f',
|
|
||||||
numCombinations: 12
|
|
||||||
}, {
|
|
||||||
name: 'optional skin tone before trailing female/male zwj',
|
|
||||||
re: '(.+?)(?:\\ud83c[\\udffb-\\udfff])?\\u200d[\\u2640\\u2642]\\ufe0f',
|
|
||||||
numCombinations: 12
|
|
||||||
}
|
|
||||||
];
|
|
||||||
|
|
||||||
commonPatterns.forEach(function(pattern) {
|
|
||||||
var mapOfMatches = {};
|
|
||||||
var re = new RegExp('^' + pattern.re + '$');
|
|
||||||
q.zwj.forEach(function(jsonString) {
|
|
||||||
var rawString = JSON.parse('"' + jsonString + '"');
|
|
||||||
var match = rawString.match(re);
|
|
||||||
if (match) {
|
|
||||||
var key = match[1];
|
|
||||||
mapOfMatches[key] = mapOfMatches[key] || [];
|
|
||||||
mapOfMatches[key].push(match[0]);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
var replacements = [];
|
|
||||||
Object.keys(mapOfMatches).forEach(function(key) {
|
|
||||||
var matches = mapOfMatches[key];
|
|
||||||
// Only a complete set may be replaced
|
|
||||||
if (matches.length === pattern.numCombinations) {
|
|
||||||
replacements.push(Utils.UTF162JSON(key));
|
|
||||||
// Remove all items in the match set from the original zwj list
|
|
||||||
matches.forEach(function(rawString) {
|
|
||||||
var indexToRemove = q.zwj.indexOf(Utils.UTF162JSON(rawString));
|
|
||||||
if (indexToRemove >= 0) {
|
|
||||||
q.zwj.splice(indexToRemove, 1);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
if (replacements.length) {
|
|
||||||
// Replace the wildcard section of the regex with a regex group of replacements
|
|
||||||
var re = pattern.re.replace('(.+?', '(?:' + generateRegexPartial(replacements));
|
|
||||||
q.zwjCommonPatterns.push(re);
|
|
||||||
console.log('Refactoring ' + replacements.length + ' complete sets of ' + pattern.numCombinations + ' zwj from ' + pattern.name);
|
|
||||||
} else {
|
|
||||||
console.log('did not find any complete sets of ' + pattern.name);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
q.next();
|
|
||||||
},
|
|
||||||
|
|
||||||
// with all info, generate a RegExp that will catch
|
|
||||||
// only standard emoji that are present in our assets
|
|
||||||
function generateRegExp(q) {
|
|
||||||
console.log('generating a RegExp for available assets');
|
|
||||||
q.re = '';
|
|
||||||
|
|
||||||
// The Zero-width joiner common patterns, if present, need to come first
|
|
||||||
if (q.zwjCommonPatterns.length) {
|
|
||||||
q.re += q.zwjCommonPatterns.join('|') + '|';
|
|
||||||
}
|
|
||||||
|
|
||||||
// Then the rest of the zwjs
|
|
||||||
if (q.zwj.length) {
|
|
||||||
q.re += generateRegexPartial(q.zwj) + '|';
|
|
||||||
}
|
|
||||||
|
|
||||||
// Group the variant sensitive keycaps
|
|
||||||
if (q.sensitiveKeycaps.length) {
|
|
||||||
q.re += '(?:' + generateRegexPartial(q.sensitiveKeycaps) + ')\\ufe0f?\\u20e3|';
|
|
||||||
}
|
|
||||||
|
|
||||||
// Next, add the diversity enabled Emoji that may include a skin tone suffix
|
|
||||||
if (q.diversity.length + q.diversitySensitive.length) {
|
|
||||||
q.re += '(?:';
|
|
||||||
if (q.diversitySensitive.length) {
|
|
||||||
// Some diversity are sensitive to variants
|
|
||||||
q.re += '(?:' + generateRegexPartial(q.diversitySensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
|
|
||||||
if (q.diversity.length) {
|
|
||||||
q.re += '|';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
q.re += generateRegexPartial(q.diversity) + ')(?:' + generateRegexPartial(skinToneOptions) + '|)|';
|
|
||||||
}
|
|
||||||
|
|
||||||
// Next, the normal Emoji
|
|
||||||
q.re += generateRegexPartial(q.regular) + '|';
|
|
||||||
|
|
||||||
// Finally, add the rest of the sensitive ones that may be followed by U+FE0F but not U+FE0E
|
|
||||||
q.re += '(?:' + generateRegexPartial(q.sensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
|
|
||||||
q.next();
|
|
||||||
},
|
|
||||||
|
|
||||||
// final step: hands the generated RegExp source over to createTwemoji,
// then loads ./create-dist
function generateFile(q) {
  var regExpSource = q.re;
  console.log('generating ./twemoji.js');
  createTwemoji(regExpSource);
  require('./create-dist');
}
|
|
||||||
|
|
||||||
]);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
function createTwemoji(re) {
|
|
||||||
fs.writeFileSync(
|
fs.writeFileSync(
|
||||||
file('2/twemoji.js'),
|
file('2/twemoji.js'),
|
||||||
'/*jslint indent: 2, browser: true, bitwise: true, plusplus: true */\n' +
|
'/*jslint indent: 2, browser: true, bitwise: true, plusplus: true */\n' +
|
||||||
@ -1042,4 +599,8 @@ function createTwemoji(re) {
|
|||||||
) +
|
) +
|
||||||
'\n */'
|
'\n */'
|
||||||
) + '());');
|
) + '());');
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
createTwemoji();
|
||||||
|
require('./create-dist');
|
||||||
|
Loading…
x
Reference in New Issue
Block a user